bitkeeper revision 1.1159.1.551 (42090342LHDFQZTluOIKtDxiXyfkHA)
authorakw27@labyrinth.cl.cam.ac.uk <akw27@labyrinth.cl.cam.ac.uk>
Tue, 8 Feb 2005 18:21:54 +0000 (18:21 +0000)
committerakw27@labyrinth.cl.cam.ac.uk <akw27@labyrinth.cl.cam.ac.uk>
Tue, 8 Feb 2005 18:21:54 +0000 (18:21 +0000)
Initial checkin of blktap user-land tools.  These are fairly experimental,
but a few people have asked to use them.  This checkin also includes
Christian's gnbd client library code.

Signed-off-by: andrew.warfield@cl.cam.ac.uk
27 files changed:
.rootkeys
tools/blktap/Makefile [new file with mode: 0644]
tools/blktap/README [new file with mode: 0644]
tools/blktap/blkaio.c [new file with mode: 0644]
tools/blktap/blkaiolib.c [new file with mode: 0644]
tools/blktap/blkaiolib.h [new file with mode: 0644]
tools/blktap/blkcow.c [new file with mode: 0644]
tools/blktap/blkcowgnbd.c [new file with mode: 0644]
tools/blktap/blkcowimg.c [new file with mode: 0644]
tools/blktap/blkcowlib.c [new file with mode: 0644]
tools/blktap/blkcowlib.h [new file with mode: 0644]
tools/blktap/blkdump.c [new file with mode: 0644]
tools/blktap/blkgnbd.c [new file with mode: 0644]
tools/blktap/blkgnbdlib.c [new file with mode: 0644]
tools/blktap/blkgnbdlib.h [new file with mode: 0644]
tools/blktap/blkimg.c [new file with mode: 0644]
tools/blktap/blkimglib.c [new file with mode: 0644]
tools/blktap/blkimglib.h [new file with mode: 0644]
tools/blktap/blkint.h [new file with mode: 0644]
tools/blktap/blktaplib.c [new file with mode: 0644]
tools/blktap/blktaplib.h [new file with mode: 0644]
tools/blktap/libgnbd/Makefile [new file with mode: 0644]
tools/blktap/libgnbd/gnbdtest.c [new file with mode: 0644]
tools/blktap/libgnbd/libgnbd.c [new file with mode: 0644]
tools/blktap/libgnbd/libgnbd.h [new file with mode: 0644]
tools/python/xen/xend/server/blkif.py
xen/include/public/io/blkif.h

index e3c0b4fdb03792a099e65bafd195690e31aef973..97ace4a85efee5ce8a0144e06d90c97723093bbb 100644 (file)
--- a/.rootkeys
+++ b/.rootkeys
 413aa1d0oNP8HXLvfPuMe6cSroUfSA patches/linux-2.6.9/agpgart.patch
 3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile
 40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Rules.mk
+4209033eUwhDBJ_bxejiv5c6gjXS4A tools/blktap/Makefile
+4209033ewLAHdhGrT_2jo3Gb_5bDcA tools/blktap/README
+4209033eX_Xw94wHaOCtnU9nOAtSJA tools/blktap/blkaio.c
+4209033egwf6LDxM2hbaqi9rRdZy4A tools/blktap/blkaiolib.c
+4209033f9yELLK85Ipo2oKjr3ickgQ tools/blktap/blkaiolib.h
+4209033fL9LcSI6LXrIp5O4axbUBLg tools/blktap/blkcow.c
+4209033fUDlFGZreIyZHdP7h7yfvuQ tools/blktap/blkcowgnbd.c
+4209033fCgZzLeMOwNBFmsp99x58ZQ tools/blktap/blkcowimg.c
+4209033frfXH6oOi9AvRz08PPAndNA tools/blktap/blkcowlib.c
+4209033fhFd_y2go9HgCF395A35xJg tools/blktap/blkcowlib.h
+4209033fHgtGpb_K16_xC9CpkjNZLw tools/blktap/blkdump.c
+4209033fm61CZG1RyKDW75V-eTZ9fg tools/blktap/blkgnbd.c
+4209033fVfa-R6MFgGcmsQHTDna4PA tools/blktap/blkgnbdlib.c
+4209033fIgDQbaHwHStHhPEDTtbqsA tools/blktap/blkgnbdlib.h
+4209033figp5JRsKsXY8rw4keRumkg tools/blktap/blkimg.c
+42090340V-8HKGlr00SyJGsE5jXC3A tools/blktap/blkimglib.c
+42090340c7pQbh0Km8zLcEqPd_3zIg tools/blktap/blkimglib.h
+42090340_mvZtozMjghPJO0qsjk4NQ tools/blktap/blkint.h
+42090340rc2q1wmlGn6HtiJAkqhtNQ tools/blktap/blktaplib.c
+42090340C-WkRPT7N3t-8Lzehzogdw tools/blktap/blktaplib.h
+42090340B3mDvcxvd9ehDHUkg46hvw tools/blktap/libgnbd/Makefile
+42090340ZWkc5Xhf9lpQmDON8HJXww tools/blktap/libgnbd/gnbdtest.c
+42090340ocMiUScJE3OpY7QNunvSbg tools/blktap/libgnbd/libgnbd.c
+42090340G5_F_EeVnPORKB0pTMGGhA tools/blktap/libgnbd/libgnbd.h
 4124b307nRyK3dhn1hAsvrY76NuV3g tools/check/Makefile
 4124b307vHLUWbfpemVefmaWDcdfag tools/check/README
 4124b307jt7T3CHysgl9LijNHSe1tA tools/check/check_brctl
diff --git a/tools/blktap/Makefile b/tools/blktap/Makefile
new file mode 100644 (file)
index 0000000..389095e
--- /dev/null
@@ -0,0 +1,100 @@
+# Shared-library version.  The built library file is
+# libblktap.so.$(MAJOR).$(MINOR); its soname carries only $(MAJOR).
+MAJOR    = 2.0
+MINOR    = 0
+SONAME   = libblktap.so.$(MAJOR)
+
+CC       = gcc
+
+# Top of the Xen tree, relative to this directory.
+# NOTE(review): XEN_LIBXC and XEN_LIBXUTIL used below are not defined in this
+# file; presumably they come from Rules.mk -- confirm.
+XEN_ROOT = ../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+INCLUDES += 
+
+# Sources that make up libblktap itself (the example programs name their own
+# sources in their rules).
+SRCS     :=
+SRCS     += blktaplib.c
+
+# Warnings are fatal (-Werror), except for unused entities (-Wno-unused).
+CFLAGS   += -Wall
+CFLAGS   += -Werror
+CFLAGS   += -Wno-unused
+#CFLAGS   += -O3
+CFLAGS   += -g3
+CFLAGS   += -fno-strict-aliasing
+CFLAGS   += -I $(XEN_LIBXC)
+CFLAGS   += -I $(XEN_LIBXUTIL)
+CFLAGS   += $(INCLUDES) -I.
+CFLAGS   += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
+# Get gcc to generate the dependencies for us.
+# $(@F) is expanded per target, so each target writes its own hidden
+# .<target>.d file; DEPS is the glob that matches all of them.
+CFLAGS   += -Wp,-MD,.$(@F).d
+DEPS     = .*.d
+
+OBJS     = $(patsubst %.c,%.o,$(SRCS))
+
+# The real library plus the two symlink names that point at it.
+LIB      = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR)
+
+# Default target: set up the header symlinks, build every example program,
+# then (re)invoke make for the library files.
+# NOTE(review): nothing orders mk-symlinks before the program targets other
+# than their position in this prerequisite list, which a parallel build does
+# not honour -- confirm whether -j builds are expected to work here.
+all: mk-symlinks blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd blkaio
+       $(MAKE) $(LIB)
+
+# Sparse Linux tree(s) matching linux-2.6.*-xen-sparse under XEN_ROOT.
+# NOTE(review): if more than one tree matches, LINUX_ROOT holds several words
+# and the last ln below will not do what is intended -- confirm.
+LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse)
+# Create a local xen/ directory tree of symlinks to the Xen public headers
+# (xen/, xen/io/) and the Linux-side public headers (xen/linux/).  The extra
+# ../ components compensate for the cd into the subdirectory.
+mk-symlinks:
+       [ -e xen/linux ] || mkdir -p xen/linux
+       [ -e xen/io ]    || mkdir -p xen/io
+       ( cd xen >/dev/null ; \
+         ln -sf ../$(XEN_ROOT)/xen/include/public/*.h . )
+       ( cd xen/io >/dev/null ; \
+          ln -sf ../../$(XEN_ROOT)/xen/include/public/io/*.h . )
+       ( cd xen/linux >/dev/null ; \
+         ln -sf ../../$(LINUX_ROOT)/include/asm-xen/linux-public/*.h . )
+
+# Install the library and its public header under $(prefix)/usr, then
+# recreate the two version symlinks in the destination directory.
+install: all
+       mkdir -p $(prefix)/usr/lib
+       mkdir -p $(prefix)/usr/include
+       install -m0755 $(LIB) $(prefix)/usr/lib
+       ln -sf libblktap.so.$(MAJOR).$(MINOR) \
+                $(prefix)/usr/lib/libblktap.so.$(MAJOR)
+       ln -sf libblktap.so.$(MAJOR) $(prefix)/usr/lib/libblktap.so
+       install -m0644 blktaplib.h $(prefix)/usr/include
+
+# Remove build products, the generated dependency files, the xen/ symlink
+# tree created by mk-symlinks, and the example programs.
+clean:
+       rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd blkaio
+
+# Build a binary RPM from rpm.spec in a scratch staging/ directory and move
+# the result into the current directory.
+# NOTE(review): as shown here the --define arguments have no separator
+# between the macro name and its value (e.g. "staging$$PWD/staging"); this
+# may be whitespace lost in transit -- confirm against the original file.
+rpm: all
+       rm -rf staging
+       mkdir staging
+       mkdir staging/i386
+       rpmbuild --define "staging$$PWD/staging" --define '_builddir.' \
+               --define "_rpmdir$$PWD/staging" -bb rpm.spec
+       mv staging/i386/*.rpm .
+       rm -rf staging
+
+# libblktap.so -> libblktap.so.$(MAJOR) -> libblktap.so.$(MAJOR).$(MINOR).
+# The first two are symlinks; only the fully versioned name is a real file,
+# linked from $(OBJS) with $(SONAME) recorded as its soname.
+libblktap.so:
+       ln -sf libblktap.so.$(MAJOR) $@
+libblktap.so.$(MAJOR):
+       ln -sf libblktap.so.$(MAJOR).$(MINOR) $@
+libblktap.so.$(MAJOR).$(MINOR): $(OBJS)
+       $(CC) -Wl,-soname -Wl,$(SONAME) -shared -o $@ $^ -L../libxutil -lxutil -lz
+
+# Example tap programs.  Each one is compiled and linked in a single step
+# from the sources listed as its prerequisites, against the libblktap built
+# in this directory (-L.).
+#
+# Libraries are named AFTER the sources that use them: the linker resolves
+# symbols left to right, so a -l option placed before its users can be
+# dropped (static archives, --as-needed).
+#
+# NOTE(review): blkgnbd and blkcowgnbd link libgnbd/libgnbd.a, which no rule
+# in this file builds -- presumably libgnbd/Makefile must be run first.
+blkdump: $(LIB) blkdump.c
+       $(CC) $(CFLAGS) -o $@ -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. blkdump.c -lblktap
+
+blkcowimg: $(LIB) blkcowimg.c blkcowlib.c blkimglib.c
+       $(CC) $(CFLAGS) -o $@ -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. blkcowimg.c blkimglib.c blkcowlib.c -lblktap -ldb
+
+blkcow: $(LIB) blkcow.c blkcowlib.c
+       $(CC) $(CFLAGS) -o $@ -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. blkcow.c blkcowlib.c -lblktap -ldb
+
+blkimg: $(LIB) blkimg.c blkimglib.c
+       $(CC) $(CFLAGS) -o $@ -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. blkimg.c blkimglib.c -lblktap
+
+blkgnbd: $(LIB) blkgnbd.c blkgnbdlib.c
+       $(CC) $(CFLAGS) -o $@ -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. blkgnbd.c blkgnbdlib.c libgnbd/libgnbd.a -lblktap
+
+# Prerequisite is blkcowgnbd.c (the file actually compiled), not blkgnbd.c.
+blkcowgnbd: $(LIB) blkcowgnbd.c blkcowlib.c blkgnbdlib.c
+       $(CC) $(CFLAGS) -o $@ -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. blkcowgnbd.c blkgnbdlib.c blkcowlib.c libgnbd/libgnbd.a -lblktap -ldb
+
+blkaio: $(LIB) blkaio.c blkaiolib.c
+       $(CC) $(CFLAGS) -o $@ -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. blkaio.c blkaiolib.c -lblktap -laio -lpthread
+
+# Targets that name actions rather than files.  `all` is included so that a
+# stray file called "all" cannot make the default build a no-op.
+.PHONY: all TAGS clean install mk-symlinks rpm
+# Emacs tags for the library sources and every header in this directory.
+TAGS:
+       etags -t $(SRCS) *.h
+
+# Pull in the compiler-generated dependency files; the leading '-' keeps the
+# first build (when none exist yet) quiet.
+-include $(DEPS)
diff --git a/tools/blktap/README b/tools/blktap/README
new file mode 100644 (file)
index 0000000..cca9a28
--- /dev/null
@@ -0,0 +1,149 @@
+Block Tap User-level Interfaces
+Andrew Warfield
+andrew.warfield@cl.cam.ac.uk
+February 8, 2005
+
+NOTE #1: The blktap is _experimental_ code.  It works for me.  Your
+mileage may vary.  Don't use it for anything important.  Please. ;)
+
+NOTE #2: All of the interfaces here are likely to change.  This is all
+early code, and I am checking it in because others want to play with
+it.  If you use it for anything, please let me know!
+
+Overview:
+---------
+
+This directory contains a library and set of example applications for
+the block tap device.  The block tap hooks into the split block device
+interfaces above Xen allowing them to be extended.  This extension can
+be done in userspace with the help of a library.
+
+The tap can be installed either as an interposition domain in between
+a frontend and backend driver pair, or as a terminating backend, in
+which case it is responsible for serving all requests itself.
+
+There are two reasons that you might want to use the tap,
+corresponding to these configurations:
+
+ 1. To examine or modify a stream of block requests while they are
+    in-flight (e.g. to encrypt data, or add data-driven watchpoints)
+
+ 2. To prototype a new backend driver, serving requests from the tap
+    rather than passing them along to the XenLinux blkback driver.
+    (e.g. to forward block requests to a remote host)
+
+
+Interface:
+----------
+
+At the moment, the tap interface is similar in spirit to that of the
+Linux netfilter.  Requests are messages from a client (frontend)
+domain to a disk (backend) domain.  Responses are messages travelling
+back, acknowledging the completion of a request.  The library allows
+chains of functions to be attached to these events.  In addition,
+hooks may be attached to handle control messages, which signify things
+like connections from new domains.
+
+At present the control messages especially expose a lot of the
+underlying driver interfaces.  This may change in the future in order
+to simplify writing hooks.
+
+Here are the public interfaces:
+
+These allow hook functions to be chained:
+
+ void blktap_register_ctrl_hook(char *name, int (*ch)(control_msg_t *));
+ void blktap_register_request_hook(char *name, int (*rh)(blkif_request_t *));
+ void blktap_register_response_hook(char *name, int (*rh)(blkif_response_t *));
+
+This allows a response to be injected, in the case where a request has
+been removed using BLKTAP_STOLEN.
+
+ void blktap_inject_response(blkif_response_t *);
+
+These let you add file descriptors and handlers to the main poll loop:
+
+ int  blktap_attach_poll(int fd, short events, int (*func)(int));
+ void blktap_detach_poll(int fd);
+
+This starts the main poll loop:
+
+ int  blktap_listen(void);
+
+Example:
+--------
+
+blkimg.c uses an image on the local file system to serve requests to
+a domain.  Here's what it looks like:
+
+---[blkimg.c]---
+
+/* blkimg.c
+ *
+ * file-backed disk.
+ */
+
+#include "blktaplib.h"
+#include "blkimglib.h"
+
+
+int main(int argc, char *argv[])
+{
+    image_init();
+    
+    blktap_register_ctrl_hook("image_control", image_control);
+    blktap_register_request_hook("image_request", image_request);
+    blktap_listen();
+    
+    return 0;
+}
+
+----------------
+
+All of the real work is in blkimglib.c, but this illustrates the
+actual tap interface well enough.  image_control() will be called with
+all control messages.  image_request() handles requests.  As it reads
+from an on-disk image file, no requests are ever passed on to a
+backend, and so there will be no responses to process -- so there is
+nothing registered as a response hook.
+
+Other examples:
+---------------
+
+Here is a list of other examples in the directory:
+
+Things that terminate a block request stream:
+
+  blkimg    - Use an image file/device to serve requests
+  blkgnbd   - Use a remote gnbd server to serve requests
+  blkaio    - Use libaio... (DOES NOT WORK)
+  
+Things that don't:
+
+  blkdump   - Print in-flight requests.
+  blkcow    - Really inefficient copy-on-write disks using libdb to store
+              writes.
+
+There are examples of plugging these things together, for instance
+blkcowgnbd is a read-only gnbd device with copy-on-write to a local
+file.
+
+TODO:
+-----
+
+- Make session tracking work.  At the moment these generally just handle a 
+  single front-end client at a time.
+
+- Integrate with Xend.  Need to cleanly pass an image identifier in the connect
+  message.
+
+- Make an asynchronous file-io terminator.  The libaio attempt is
+  tragically stalled because mapped foreign pages make pfn_valid fail
+  (they are VM_IO), and so cannot be passed to aio as targets.  A
+  better solution may be to tear the disk interfaces out of the real
+  backend and expose them somehow.
+
+- Make CoW suck less.
+
+- Do something more along the lines of dynamic linking for the
+  plugins, so that they don't all need a new main().
diff --git a/tools/blktap/blkaio.c b/tools/blktap/blkaio.c
new file mode 100644 (file)
index 0000000..2549571
--- /dev/null
@@ -0,0 +1,19 @@
+/* blkaio.c
+ *
+ * libaio-backed disk.
+ */
+
+#include "blktaplib.h"
+#include "blkaiolib.h"
+
+
+/*
+ * Entry point of the libaio-backed tap program: initialises the aio library,
+ * registers its control and request hooks with the tap, then enters the
+ * tap's poll loop via blktap_listen().  No response hook is registered.
+ * The command-line arguments are not used and the result of blktap_listen()
+ * is ignored; main always returns 0.
+ */
+int main(int argc, char *argv[])
+{
+    aio_init();
+    
+    blktap_register_ctrl_hook("aio_control", aio_control);
+    blktap_register_request_hook("aio_request", aio_request);
+    blktap_listen();
+    
+    return 0;
+}
diff --git a/tools/blktap/blkaiolib.c b/tools/blktap/blkaiolib.c
new file mode 100644 (file)
index 0000000..4538a9e
--- /dev/null
@@ -0,0 +1,489 @@
+/* blkaiolib.c
+ *
+ * file/device image-backed block device -- using linux libaio.
+ * 
+ * (c) 2004 Andrew Warfield.
+ *
+ * Xend has been modified to use an amorfs:[fsid] disk tag.
+ * This will show up as device type (maj:240,min:0) = 61440.
+ *
+ * The fsid is placed in the sec_start field of the disk extent.
+ *
+ * NOTE: This doesn't work.  Grrr.
+ */
+
+#define _GNU_SOURCE
+#define __USE_LARGEFILE64
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <db.h>       
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/poll.h>
+#include <unistd.h>
+#include <errno.h>
+#include <libaio.h>
+#include <pthread.h>
+#include <time.h>
+#include "blktaplib.h"
+
+/* Backing image opened for every VBD_GROW; hard-coded until image selection
+ * is configurable (see the TODO in aio_control). */
+//#define TMP_IMAGE_FILE_NAME "/dev/sda1"
+#define TMP_IMAGE_FILE_NAME "fc3.image"
+
+/* MAX_DOMS sizes images[], MAX_IMGNAME_LEN sizes image_t.imgname, and
+ * AMORFS_DEV is the only extent device number aio_control accepts
+ * (maj:240,min:0 per the file header). */
+#define MAX_DOMS              1024
+#define MAX_IMGNAME_LEN        255
+#define AMORFS_DEV           61440
+#define MAX_REQUESTS            64 /* must be synced with the blkif drivers. */
+#define MAX_SEGMENTS_PER_REQ    11
+/* log2 of the sector size in bytes: sector counts are shifted left by this
+ * to obtain byte sizes and offsets. */
+#define SECTOR_SHIFT             9
+/* One iocb per segment of every possible outstanding request. */
+#define MAX_AIO_REQS   (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)
+                                                                                
+#if 1
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+           
+#if 1                                                                        
+#define ASSERT(_p) \
+    if ( !(_p) ) { printf("Assertion '%s' failed, line %d, file %s", #_p , \
+    __LINE__, __FILE__); *(int*)0=0; }
+#else
+#define ASSERT(_p) ((void)0)
+#endif                                                                     
+
+char dbg_page[4096];
+
+/* Per-domain image record, allocated on BE_CREATE and filled in on
+ * VBD_GROW. */
+typedef struct {
+    /* These need to turn into an array/rbtree for multi-disk support. */
+    int  fd;                         /* open image file, or -1 if none yet  */
+    u64  fsid;                       /* taken from the extent's sector_start */
+    char imgname[MAX_IMGNAME_LEN];   /* name of the backing image file      */
+    blkif_vdev_t   vdevice;          /* virtual device reported on PROBE    */
+} image_t;
+
+/* Note on pending_reqs: I assume all reqs are queued before they start to
+ * get filled.  so count of 0 is an unused record.
+ *
+ * count is the number of segment I/Os still outstanding for req; the
+ * response is injected when it drops back to 0.
+ */
+typedef struct {
+    blkif_request_t  req;
+    int              count;
+} pending_req_t;
+
+/* In-flight requests, indexed by ID_TO_IDX(req->id). */
+static pending_req_t    pending_list[MAX_REQUESTS];
+/* Image records, indexed by domain id; NULL means not connected. */
+image_t                *images[MAX_DOMS];
+
+/* libaio context plus a stack of free iocbs: iocb_free_count is the number
+ * of usable entries, taken from and returned to the top of iocb_free[]. */
+static io_context_t  ctx;
+static struct iocb  *iocb_free[MAX_AIO_REQS];
+static int           iocb_free_count;
+
+/* ---[ Notification mechanism ]--------------------------------------- */
+
+/* Indices of the two ends of the aio_notify pipe. */
+enum { 
+    READ   = 0,
+    WRITE  = 1
+};
+
+/* Pipe used by the notifier thread to wake the poll loop, and the flag that
+ * hands the shared event buffer back and forth: the notifier only collects
+ * events while aio_listening is set and clears it after filling aio_events;
+ * aio_pollhook sets it again once it has consumed them. */
+static int aio_notify[2];
+static volatile int aio_listening = 0;
+
+/* Completion events collected by the notifier thread, and how many of them
+ * are valid. */
+static struct io_event aio_events[MAX_AIO_REQS];
+static int             aio_event_count = 0;
+
+/* this is commented out in libaio.h for some reason. */
+extern int io_queue_wait(io_context_t ctx, struct timespec *timeout);
+
+/*
+ * Body of the completion-notifier thread (never returns).
+ *
+ * While aio_listening is set it blocks in io_getevents() for at least one
+ * completion, stores the batch in aio_events/aio_event_count, clears
+ * aio_listening so the batch is not overwritten, and writes a token to the
+ * notify pipe to wake the poll loop.  aio_pollhook() re-arms aio_listening
+ * once it has consumed the batch.
+ *
+ * arg is unused.
+ */
+static void *notifier_thread(void *arg)
+{
+    int ret; 
+    int msg = 0x00feeb00;
+    
+    printf("Notifier thread started.\n");
+    for (;;) {
+        if (!aio_listening) {
+            usleep(1000); /* Not ready to read. */
+            continue;
+        }
+        
+        //ret = io_queue_wait(ctx, 0);
+        ret = io_getevents(ctx, 1, MAX_AIO_REQS, aio_events, 0);
+        if (ret > 0) {
+            aio_event_count = ret;
+            printf("[Notifying! (%d)]\n", aio_event_count);
+            aio_listening = 0;
+            /* No fsync here: a pipe cannot be synced and the write alone
+             * is what wakes the poller. */
+            if (write(aio_notify[WRITE], &msg, sizeof(msg)) != sizeof(msg))
+                printf("[notify write failed! %d]\n", errno);
+        } else {
+            /* The libaio wrappers return a negated error code instead of
+             * setting errno, so report that rather than a stale errno. */
+            printf("[io_queue_wait error! %d]\n", -ret);
+            usleep(1000);
+        }
+    }
+}
+
+/* -------------------------------------------------------------------- */
+
+/*
+ * Control-message hook for the aio tap.
+ *
+ * Only block-backend (CMSG_BLKIF_BE) messages are expected; of those:
+ *   BE_CREATE   - allocate the per-domain image record (no file yet);
+ *   BE_DESTROY  - close the image file, if open, and free the record;
+ *   VBD_GROW    - record fsid/vdevice and open the backing image file.
+ * Other subtypes are ignored.  Every path returns 0.
+ *
+ * The domain id is taken from the message and used to index images[], so it
+ * is range-checked against MAX_DOMS before every use.
+ */
+int aio_control(control_msg_t *msg)
+{
+    domid_t  domid;
+    
+    if (msg->type != CMSG_BLKIF_BE) 
+    {
+        printf("***\nUNEXPECTED CTRL MSG MAJOR TYPE(%d)\n***\n", msg->type);
+        return 0;
+    }
+    
+    switch(msg->subtype)
+    {
+    case CMSG_BLKIF_BE_CREATE:
+        if ( msg->length != sizeof(blkif_be_create_t) )
+            goto parse_error;
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_CREATE(d:%d,h:%d)\n",
+                ((blkif_be_create_t *)msg->msg)->domid,
+                ((blkif_be_create_t *)msg->msg)->blkif_handle);
+        domid = ((blkif_be_create_t *)msg->msg)->domid;
+        if ((unsigned long)domid >= MAX_DOMS)
+            goto bad_domid;
+        if (images[domid] != NULL) {
+            printf("attempt to connect from an existing dom!\n");
+            return 0;
+        }
+        
+        images[domid] = (image_t *)malloc(sizeof(image_t));
+        if (images[domid] == NULL) {
+            printf("error allocating image record.\n");
+            return 0;
+        }
+        
+        images[domid]->fd  = -1;
+        images[domid]->fsid = 0;
+        
+        printf("Image connected.\n");
+        break;   
+        
+    case CMSG_BLKIF_BE_DESTROY:
+        if ( msg->length != sizeof(blkif_be_destroy_t) )
+            goto parse_error;
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_DESTROY(d:%d,h:%d)\n",
+                ((blkif_be_destroy_t *)msg->msg)->domid,
+                ((blkif_be_destroy_t *)msg->msg)->blkif_handle);
+        
+        domid = ((blkif_be_destroy_t *)msg->msg)->domid;
+        if ((unsigned long)domid >= MAX_DOMS)
+            goto bad_domid;
+        if (images[domid] != NULL) {
+            if (images[domid]->fd != -1)
+                close( images[domid]->fd );
+            free( images[domid] );
+            images[domid] = NULL;
+        }
+        break;  
+    case CMSG_BLKIF_BE_VBD_GROW:
+    {
+        blkif_be_vbd_grow_t *grow;
+        
+        if ( msg->length != sizeof(blkif_be_vbd_grow_t) )
+            goto parse_error;
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_GROW(d:%d,h:%d,v:%d)\n",
+                ((blkif_be_vbd_grow_t *)msg->msg)->domid,
+                ((blkif_be_vbd_grow_t *)msg->msg)->blkif_handle,
+                ((blkif_be_vbd_grow_t *)msg->msg)->vdevice);
+        printf("              Extent: sec_start: %llu sec_len: %llu, dev: %d\n",
+                ((blkif_be_vbd_grow_t *)msg->msg)->extent.sector_start,
+                ((blkif_be_vbd_grow_t *)msg->msg)->extent.sector_length,
+                ((blkif_be_vbd_grow_t *)msg->msg)->extent.device);
+        grow = (blkif_be_vbd_grow_t *)msg->msg;
+        domid = grow->domid;
+        if ((unsigned long)domid >= MAX_DOMS)
+            goto bad_domid;
+        if (images[domid] == NULL) {
+            printf("VBD_GROW on unconnected domain!\n");
+            return 0;
+        }
+        
+        if (grow->extent.device != AMORFS_DEV) {
+            printf("VBD_GROW on non-amorfs device!\n");
+            return 0;
+        }
+        
+        /* A second GROW for the same domain must not leak the first fd. */
+        if (images[domid]->fd != -1) {
+            close( images[domid]->fd );
+            images[domid]->fd = -1;
+        }
+        
+        /* TODO: config support for arbitrary image files/modes. */
+        snprintf(images[domid]->imgname, sizeof(images[domid]->imgname),
+                 "%s", TMP_IMAGE_FILE_NAME);
+        
+        images[domid]->fsid   = grow->extent.sector_start;
+        images[domid]->vdevice = grow->vdevice; 
+        images[domid]->fd = open(images[domid]->imgname, 
+                O_RDWR | O_DIRECT | O_LARGEFILE);
+        if (images[domid]->fd < 0) {
+            printf("Couldn't open image file! %d\n", errno);
+            /* Keep the "not open" sentinel that aio_request tests for. */
+            images[domid]->fd = -1;
+            return 0;
+        }
+        
+        printf("Image file opened. (%s)\n", images[domid]->imgname);
+        break;
+    }    
+    }
+    return 0;
+parse_error:
+    printf("Bad control message!\n");
+    return 0;
+    
+bad_domid:
+    printf("Control message for out-of-range domain! (%d)\n", domid);
+    return 0;
+}    
+/*
+ * Queue one asynchronous read or write per segment of req against fd.
+ *
+ * The request is copied into its pending_list slot with count set to the
+ * number of segments; aio_pollhook() counts completions down and injects
+ * the response.  Returns 0 when the request has been handed to the kernel,
+ * or -1 when nothing was queued and the caller must answer with an error.
+ */
+static int aio_queue_segments(blkif_request_t *req, int fd, int is_write)
+{
+    struct iocb *ioq[MAX_SEGMENTS_PER_REQ]; 
+    struct iocb *io;
+    unsigned long size;
+    u64 sector;
+    char *page;
+    int i, idx, nr, ret, submitted;
+    
+    idx = ID_TO_IDX(req->id);
+    nr  = req->nr_segments;
+    
+    /* Everything below indexes fixed-size arrays with values supplied by
+     * the frontend, so validate them first. */
+    if ( (idx < 0) || (idx >= MAX_REQUESTS) ) {
+        printf("Request index out of range! (%d)\n", idx);
+        return -1;
+    }
+    if ( (nr < 1) || (nr > MAX_SEGMENTS_PER_REQ) ) {
+        printf("Bad segment count in request! (%d)\n", nr);
+        return -1;
+    }
+    if ( iocb_free_count < nr ) {
+        printf("Out of free iocbs! (%d)\n", iocb_free_count);
+        return -1;
+    }
+    
+    ASSERT(pending_list[idx].count == 0);
+    memcpy(&pending_list[idx].req, req, sizeof(*req));
+    pending_list[idx].count = nr;
+    
+    for (i = 0; i < nr; i++) {
+        
+        /* NOTE(review): assumes every earlier segment covered a full
+         * 8-sector page -- carried over unchanged from the original. */
+        sector = req->sector_number + (8*i);
+        
+        size = blkif_last_sect (req->frame_and_sects[i]) -
+               blkif_first_sect(req->frame_and_sects[i]) + 1;
+        
+        page  = (char *)MMAP_VADDR(idx, i);
+        page += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
+        
+        if (is_write) {
+            DPRINTF("iWRITE: sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n", 
+                    req->sector_number, sector, 
+                    blkif_first_sect(req->frame_and_sects[i]),
+                    blkif_last_sect (req->frame_and_sects[i]),
+                    (long)(sector << SECTOR_SHIFT));
+        } else {
+            DPRINTF("iREAD : sec_nr: %10llu sec: %10llu (%1lu,%1lu) "
+                    "pos: %15lu dpage: %p\n", 
+                    req->sector_number, sector, 
+                    blkif_first_sect(req->frame_and_sects[i]),
+                    blkif_last_sect (req->frame_and_sects[i]),
+                    (long)(sector << SECTOR_SHIFT), page);
+        }
+        
+        /*convert size and sector to byte offsets */
+        size   <<= SECTOR_SHIFT;
+        sector <<= SECTOR_SHIFT;
+        
+        io = iocb_free[--iocb_free_count];
+        if (is_write)
+            io_prep_pwrite(io, fd, page, size, sector);
+        else
+            io_prep_pread(io, fd, page, size, sector);
+        /* The pending-list index travels as the completion cookie. */
+        io->data = (void *)(long)idx;
+        ioq[i] = io;
+    }
+    
+    /* io_submit may accept only part of the batch; keep going until all of
+     * it is queued or the kernel refuses. */
+    ret = 0;
+    submitted = 0;
+    while (submitted < nr) {
+        ret = io_submit(ctx, nr - submitted, ioq + submitted);
+        if (ret <= 0)
+            break;
+        submitted += ret;
+    }
+    
+    if (submitted < nr) {
+        /* libaio returns a negated error code rather than setting errno. */
+        printf("BADNESS: io_submit error! (%d)\n", -ret);
+        
+        /* Give back the iocbs the kernel never accepted. */
+        for (i = nr - 1; i >= submitted; i--)
+            iocb_free[iocb_free_count++] = ioq[i];
+        
+        if (submitted == 0) {
+            pending_list[idx].count = 0;
+            return -1;
+        }
+        /* NOTE(review): some segments are in flight and some never will
+         * be.  count is left at nr so the request is never reported as
+         * successful; it stays pending, as it did before this check
+         * existed.  A per-request error flag is needed to fail it. */
+    }
+    
+    return 0;
+}
+
+/*
+ * Request hook for the aio tap: serves block requests from the image file
+ * opened for the requesting domain.
+ *
+ * PROBE is answered immediately with a single-disk description written to
+ * the request's first page.  READ and WRITE are queued through libaio and
+ * the request is stolen (BLKTAP_STOLEN); the response is injected later by
+ * aio_pollhook().  Anything that cannot be served -- unknown domain, no
+ * image open, malformed request, unknown operation -- is turned into an
+ * error response in place and returned with BLKTAP_RESPOND.
+ */
+int aio_request(blkif_request_t *req)
+{
+    int fd;
+    int ret;
+    blkif_response_t *rsp;
+    domid_t dom = ID_TO_DOM(req->id);
+    
+    /* dom indexes images[], so it must be range-checked first. */
+    if (((unsigned long)dom >= MAX_DOMS) ||
+        (images[dom] == NULL) || (images[dom]->fd == -1)) {
+        printf("Data request for unknown domain!!! %d\n", dom);
+        goto err;
+    }
+    
+    fd = images[dom]->fd;
+    
+    switch (req->operation) 
+    {
+    case BLKIF_OP_PROBE:
+    {
+        struct stat stat;
+        vdisk_t *img_info;
+        
+        
+        /* We expect one buffer only. */
+        if ( req->nr_segments != 1 )
+            goto err;
+        
+        /* Make sure the buffer is page-sized. */
+        if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
+             (blkif_last_sect (req->frame_and_sects[0]) != 7) )
+            goto err;
+
+        /* loop for multiple images would start here. */
+        
+        ret = fstat(fd, &stat);
+        if (ret != 0) {
+            printf("Couldn't stat image in PROBE!\n");
+            goto err;
+        }
+        
+        img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0);
+        img_info[0].device   = images[dom]->vdevice;
+        img_info[0].info     = VDISK_TYPE_DISK | VDISK_FLAG_VIRT;
+        img_info[0].capacity = (stat.st_size >> SECTOR_SHIFT);
+        
+        if (img_info[0].capacity == 0)
+            img_info[0].capacity = ((u64)1 << 63); // xend does this too.
+        
+        DPRINTF("iPROBE! device: 0x%04x capacity: %llu\n", img_info[0].device,
+                img_info[0].capacity);
+        
+        /* The response is built in place over the request. */
+        rsp = (blkif_response_t *)req;
+        rsp->id = req->id;
+        rsp->operation = BLKIF_OP_PROBE;
+        rsp->status = 1; /* number of disks */
+        
+        return  BLKTAP_RESPOND;
+    }    
+    case BLKIF_OP_WRITE:
+    case BLKIF_OP_READ:
+        if (aio_queue_segments(req, fd, req->operation == BLKIF_OP_WRITE) != 0)
+            goto err;
+        return BLKTAP_STOLEN;
+    }
+    
+    printf("Unknown block operation!\n");
+err:
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = req->operation;
+    rsp->status = BLKIF_RSP_ERROR;
+    return BLKTAP_RESPOND;  
+}
+
+
+/*
+ * Poll-loop handler for the notify pipe: consumes the batch of completion
+ * events gathered by the notifier thread.
+ *
+ * For each event the iocb goes back on the free stack and the owning
+ * request's outstanding count is decremented; when it reaches 0 the
+ * response is built over the saved request and injected.  A request is
+ * answered with BLKIF_RSP_ERROR if any of its segments failed, otherwise
+ * with BLKIF_RSP_OKAY.  Finally the wake-up token is drained from the pipe
+ * and aio_listening is set so the notifier may collect the next batch.
+ *
+ * fd is unused (the pipe is reached through aio_notify).  Always returns 0.
+ */
+int aio_pollhook(int fd)
+{
+    /* Set when a segment of the request in that slot has failed; cleared
+     * again when the response goes out. */
+    static char failed[MAX_REQUESTS];
+    struct io_event *ep;
+    int idx;
+    blkif_response_t *rsp;
+    
+    DPRINTF("aio_hook(): \n");
+    
+    for (ep = aio_events; aio_event_count-- > 0; ep++) {
+        struct iocb *io = ep->obj;
+        /* The cookie is the pending-list index stored at submit time. */
+        idx = (int)(long) ep->data;
+        
+        if ((idx < 0) || (idx > MAX_REQUESTS-1) || 
+            (pending_list[idx].count == 0)){
+            printf("aio returned a bad cookie (%d)!\n", idx);
+            break;
+        }
+        
+        if ((int)ep->res < 0) {
+            printf("aio request error! (%d,%d)\n", 
+                (int)ep->res, (int)ep->res2);
+            failed[idx] = 1;
+        }
+        
+        pending_list[idx].count--;
+        if (iocb_free_count < MAX_AIO_REQS)
+            iocb_free[iocb_free_count++] = io;
+        
+        if (pending_list[idx].count == 0) {
+            /* rsp aliases the saved request, so copy out what is needed
+             * before overwriting it. */
+            blkif_request_t tmp = pending_list[idx].req;
+            rsp = (blkif_response_t *)&pending_list[idx].req;
+            rsp->id = tmp.id;
+            rsp->operation = tmp.operation;
+            rsp->status = failed[idx] ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
+            failed[idx] = 0;
+            blktap_inject_response(rsp);
+        }
+    }
+    
+    printf("pollhook done!\n");
+    
+    /* Drain the notifier's token, then let it collect the next batch. */
+    if (read(aio_notify[READ], &idx, sizeof(idx)) != sizeof(idx))
+        printf("notify read failed! (%d)\n", errno);
+    aio_listening = 1;
+    
+    return 0;
+}
+
+/*
+ * Response hook: a no-op.  This library terminates the request stream, so
+ * whatever response is seen here is simply passed along.
+ */
+int aio_response(blkif_response_t *rsp)
+{
+    (void)rsp; /* deliberately unused */
+    
+    return BLKTAP_PASS;
+}
+
+/*
+ * One-time set-up of the aio tap.
+ *
+ * Clears the per-domain image table and the pending-request table, creates
+ * the libaio context, fills the free-iocb stack, creates the notify pipe,
+ * starts the notifier thread and attaches the pipe's read end to the tap's
+ * poll loop with aio_pollhook() as its handler.
+ *
+ * Any failure is fatal: a message is printed and the process exits with a
+ * non-zero status.
+ */
+void aio_init(void)
+{
+    int i, rc;
+    pthread_t p;
+    
+    for (i = 0; i < MAX_DOMS; i++)
+        images[i] = NULL;
+    
+    for (i = 0; i < MAX_REQUESTS; i++)
+        pending_list[i].count = 0; 
+    
+    memset(&ctx, 0, sizeof(ctx));
+    rc = io_queue_init(MAX_AIO_REQS, &ctx);
+    if (rc != 0) {
+        printf("queue_init failed! (%d)\n", rc);
+        exit(1);
+    }
+    
+    /* iocb_free_count is the number of valid entries on the stack, so it
+     * must end up at MAX_AIO_REQS, not at the index of the last entry. */
+    iocb_free_count = 0;
+    for (i=0; i<MAX_AIO_REQS; i++) {
+        if (!(iocb_free[i] = (struct iocb *)malloc(sizeof(struct iocb)))) {
+            printf("error allocating iocb array\n");
+            exit(1);
+        }
+        iocb_free_count = i + 1;
+    }
+    
+    rc = pipe(aio_notify);
+    if (rc != 0) {
+        printf("pipe failed! (%d)\n", errno);
+        exit(1);
+    }
+    
+    /* pthread_create reports failure through its return value, not errno. */
+    rc = pthread_create(&p, NULL, notifier_thread, NULL);
+    if (rc != 0) {
+        printf("pthread_create failed! (%d)\n", rc);
+        exit(1);
+    }
+    
+    aio_listening = 1;
+    
+    blktap_attach_poll(aio_notify[READ], POLLIN, aio_pollhook);
+}
+
diff --git a/tools/blktap/blkaiolib.h b/tools/blktap/blkaiolib.h
new file mode 100644 (file)
index 0000000..7e26dae
--- /dev/null
@@ -0,0 +1,16 @@
+/* blkaiolib.h
+ *
+ * aio image-backed block device.
+ * 
+ * (c) 2004 Andrew Warfield.
+ *
+ * Xend has been modified to use an amorfs:[fsid] disk tag.
+ * This will show up as device type (maj:240,min:0) = 61440.
+ *
+ * The fsid is placed in the sec_start field of the disk extent.
+ */
+
+/* Tap hooks provided by blkaiolib.c; blkaio.c registers the first two. */
+int aio_control(control_msg_t *msg);
+int aio_request(blkif_request_t *req);
+int aio_response(blkif_response_t *rsp); /* noop */
+/* Must be called once before the hooks are used. */
+void aio_init(void);
diff --git a/tools/blktap/blkcow.c b/tools/blktap/blkcow.c
new file mode 100644 (file)
index 0000000..82f9335
--- /dev/null
@@ -0,0 +1,31 @@
+/* blkcow.c
+ *
+ * copy on write a block device.  in a really inefficient way.
+ * 
+ * (c) 2004 Andrew Warfield.
+ *
+ * This uses whatever backend the tap is attached to as the read-only
+ * underlay -- for the moment.
+ *
+ * Xend has been modified to use an amorfs:[fsid] disk tag.
+ * This will show up as device type (maj:240,min:0) = 61440.
+ *
+ * The fsid is placed in the sec_start field of the disk extent,
+ * the cow plugin uses this to identify a unique overlay.
+ */
+
+#include "blktaplib.h"
+#include "blkcowlib.h"
+
+
+/*
+ * Entry point of the copy-on-write tap program: initialises the cow
+ * library, registers its control, request and response hooks with the tap,
+ * then enters the tap's poll loop via blktap_listen().  The command-line
+ * arguments are not used; main always returns 0.
+ */
+int main(int argc, char *argv[])
+{
+    cow_init();
+    
+    blktap_register_ctrl_hook("cow_control", cow_control);
+    blktap_register_request_hook("cow_request", cow_request);
+    blktap_register_response_hook("cow_response", cow_response);
+    blktap_listen();
+    
+    return 0;
+}
diff --git a/tools/blktap/blkcowgnbd.c b/tools/blktap/blkcowgnbd.c
new file mode 100644 (file)
index 0000000..81f9bad
--- /dev/null
@@ -0,0 +1,24 @@
+/* blkcowgnbd.c
+ *
+ * gnbd-backed cow.
+ */
+
+#include "blktaplib.h"
+#include "blkcowlib.h"
+#include "blkgnbdlib.h"
+
+
+/*
+ * Entry point of the gnbd-backed copy-on-write tap program: initialises the
+ * cow and gnbd libraries and registers their hooks, the cow hook ahead of
+ * the gnbd hook for both control messages and requests.  Only cow registers
+ * a response hook.  Control then passes to the tap's poll loop via
+ * blktap_listen().  The command-line arguments are not used; main always
+ * returns 0.
+ */
+int main(int argc, char *argv[])
+{
+    cow_init();
+    gnbd_init();
+    
+    blktap_register_ctrl_hook("cow_control", cow_control);
+    blktap_register_ctrl_hook("gnbd_control", gnbd_control);
+    blktap_register_request_hook("cow_request", cow_request);
+    blktap_register_request_hook("gnbd_request", gnbd_request);
+    blktap_register_response_hook("cow_response", cow_response);
+    blktap_listen();
+    
+    return 0;
+}
diff --git a/tools/blktap/blkcowimg.c b/tools/blktap/blkcowimg.c
new file mode 100644 (file)
index 0000000..40aa1f8
--- /dev/null
@@ -0,0 +1,24 @@
+/* blkcowimg.c
+ *
+ * file-backed cow.
+ */
+
+#include "blktaplib.h"
+#include "blkcowlib.h"
+#include "blkimglib.h"
+
+
+/*
+ * Entry point of the file-backed copy-on-write tap program: initialises the
+ * cow and image libraries and registers their hooks, the cow hook ahead of
+ * the image hook for both control messages and requests.  Only cow
+ * registers a response hook.  Control then passes to the tap's poll loop
+ * via blktap_listen().  The command-line arguments are not used; main
+ * always returns 0.
+ */
+int main(int argc, char *argv[])
+{
+    cow_init();
+    image_init();
+    
+    blktap_register_ctrl_hook("cow_control", cow_control);
+    blktap_register_ctrl_hook("image_control", image_control);
+    blktap_register_request_hook("cow_request", cow_request);
+    blktap_register_request_hook("image_request", image_request);
+    blktap_register_response_hook("cow_response", cow_response);
+    blktap_listen();
+    
+    return 0;
+}
diff --git a/tools/blktap/blkcowlib.c b/tools/blktap/blkcowlib.c
new file mode 100644 (file)
index 0000000..3518b4f
--- /dev/null
@@ -0,0 +1,380 @@
+/* blkcowlib.c
+ *
+ * copy on write a block device.  in a really inefficient way.
+ * 
+ * (c) 2004 Andrew Warfield.
+ *
+ * This uses whatever backend the tap is attached to as the read-only
+ * underlay -- for the moment.
+ *
+ * Xend has been modified to use an amorfs:[fsid] disk tag.
+ * This will show up as device type (maj:240,min:0) = 61440.
+ *
+ * The fsid is placed in the sec_start field of the disk extent,
+ * the cow plugin uses this to identify a unique overlay.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <db.h>
+#include "blktaplib.h"
+
+#define MAX_DOMS        1024
+#define MAX_DBNAME_LEN   255
+#define AMORFS_DEV     61440
+#define MAX_REQUESTS      64 /* must be synced with the blkif drivers. */
+                                                                                
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+    
+/* Berkeley db has different params for open() after 4.1 */
+#ifndef DB_VERSION_MAJOR
+# define DB_VERSION_MAJOR 1
+#endif /* DB_VERSION_MAJOR */
+#ifndef DB_VERSION_MINOR
+# define DB_VERSION_MINOR 0
+#endif /* DB_VERSION_MINOR */
+
+/* Per-domain copy-on-write overlay state; one record per entry of cows[]. */
+typedef struct {
+    /* Overlay database handle; NULL until a VBD_GROW message opens it. */
+    DB   *db;
+    /* Overlay id, taken from extent.sector_start of the VBD_GROW message. */
+    u64  fsid;
+    /* Database file name, built as the 20-digit zero-padded fsid + ".db". */
+    char dbname[MAX_DBNAME_LEN];
+} cow_t;
+
+/* Overlay records indexed by domain id; NULL means "domain not connected". */
+cow_t           *cows[MAX_DOMS];
+/* Copies of requests passed on to the backend, indexed by ID_TO_IDX(id);
+ * cow_response() uses them to overlay db contents on the returned data. */
+blkif_request_t *reread_list[MAX_REQUESTS];
+
+/*
+ * cow_control: control-message hook of the copy-on-write tap.
+ *
+ * Handles three CMSG_BLKIF_BE subtypes:
+ *   CREATE   - allocate the per-domain cow_t record;
+ *   DESTROY  - close the overlay db (if open) and free the record;
+ *   VBD_GROW - derive the overlay db name from extent.sector_start (the
+ *              fsid) and open/create the Berkeley DB overlay.
+ * Any other subtype is ignored.
+ *
+ * Domain ids are range-checked against MAX_DOMS before they index cows[].
+ * Always returns 0, including on malformed messages and on failures.
+ */
+int cow_control(control_msg_t *msg)
+{
+    domid_t  domid;
+    DB      *db;
+    int      ret;
+
+    if (msg->type != CMSG_BLKIF_BE)
+    {
+        printf("***\nUNEXPECTED CTRL MSG MAJOR TYPE(%d)\n***\n", msg->type);
+        return 0;
+    }
+
+    switch(msg->subtype)
+    {
+    case CMSG_BLKIF_BE_CREATE:
+        if ( msg->length != sizeof(blkif_be_create_t) )
+            goto parse_error;
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_CREATE(d:%d,h:%d)\n",
+                ((blkif_be_create_t *)msg->msg)->domid,
+                ((blkif_be_create_t *)msg->msg)->blkif_handle);
+        domid = ((blkif_be_create_t *)msg->msg)->domid;
+        if ((unsigned long)domid >= MAX_DOMS)
+            goto bad_domid;
+        if (cows[domid] != NULL) {
+            printf("attempt to connect from an existing dom!\n");
+            return 0;
+        }
+
+        cows[domid] = (cow_t *)malloc(sizeof(cow_t));
+        if (cows[domid] == NULL) {
+            printf("error allocating cow.\n");
+            return 0;
+        }
+
+        cows[domid]->db        = NULL;
+        cows[domid]->fsid      = 0;
+        cows[domid]->dbname[0] = '\0';
+
+        printf("COW connected.\n");
+        break;
+
+    case CMSG_BLKIF_BE_DESTROY:
+        if ( msg->length != sizeof(blkif_be_destroy_t) )
+            goto parse_error;
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_DESTROY(d:%d,h:%d)\n",
+                ((blkif_be_destroy_t *)msg->msg)->domid,
+                ((blkif_be_destroy_t *)msg->msg)->blkif_handle);
+
+        domid = ((blkif_be_destroy_t *)msg->msg)->domid;
+        if ((unsigned long)domid >= MAX_DOMS)
+            goto bad_domid;
+        if (cows[domid] != NULL) {
+            if (cows[domid]->db != NULL)
+                cows[domid]->db->close(cows[domid]->db, 0);
+            free(cows[domid]);
+            cows[domid] = NULL;
+        }
+        break;
+
+    case CMSG_BLKIF_BE_VBD_GROW:
+    {
+        blkif_be_vbd_grow_t *grow;
+
+        if ( msg->length != sizeof(blkif_be_vbd_grow_t) )
+            goto parse_error;
+        grow = (blkif_be_vbd_grow_t *)msg->msg;
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_GROW(d:%d,h:%d,v:%d)\n",
+                grow->domid, grow->blkif_handle, grow->vdevice);
+        printf("              Extent: sec_start: %llu sec_len: %llu, dev: %d\n",
+                grow->extent.sector_start,
+                grow->extent.sector_length,
+                grow->extent.device);
+        domid = grow->domid;
+        if ((unsigned long)domid >= MAX_DOMS)
+            goto bad_domid;
+        if (cows[domid] == NULL) {
+            printf("VBD_GROW on unconnected domain!\n");
+            return 0;
+        }
+
+        if (grow->extent.device != AMORFS_DEV) {
+            printf("VBD_GROW on non-amorfs device!\n");
+            return 0;
+        }
+
+        /* NOTE(review): a second VBD_GROW for the same domain replaces
+         * cows[domid]->db without closing the previous handle. */
+        snprintf(cows[domid]->dbname, sizeof(cows[domid]->dbname),
+                 "%020llu.db", grow->extent.sector_start);
+
+        cows[domid]->fsid = grow->extent.sector_start;
+
+        if ((ret = db_create(&db, NULL, 0)) != 0) {
+            fprintf(stderr, "db_create: %s\n", db_strerror(ret));
+            return 0;
+        }
+
+        /* Berkeley db has different params for open() after 4.1 */
+#if DB_VERSION_MAJOR < 4 || (DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR < 1)
+        ret = db->open(db, cows[domid]->dbname, NULL, DB_BTREE,
+                       DB_CREATE, 0664);
+#else /* DB_VERSION >= 4.1 */
+        ret = db->open(db, NULL, cows[domid]->dbname, NULL, DB_BTREE,
+                       DB_CREATE, 0664);
+#endif /* DB_VERSION < 4.1 */
+
+        if (ret != 0) {
+            db->err(db, ret, "%s", cows[domid]->dbname);
+            /* A handle whose open() failed must still be discarded with
+             * close(), otherwise it is leaked. */
+            db->close(db, 0);
+            return 0;
+        }
+        cows[domid]->db = db;
+        printf("Overlay db opened. (%s)\n", cows[domid]->dbname);
+        break;
+    }
+    }
+    return 0;
+
+parse_error:
+    printf("Bad control message!\n");
+    return 0;
+
+bad_domid:
+    printf("Domain id out of range! (%lu)\n", (unsigned long)domid);
+    return 0;
+}
+/*
+ * Turn the request buffer into a response in place and tell the tap to
+ * send it.  The response is written over the request, so the request id is
+ * captured before any response field is stored.
+ */
+static int cow_respond(blkif_request_t *req, int operation, int status)
+{
+    blkif_request_t   saved = *req;
+    blkif_response_t *rsp   = (blkif_response_t *)req;
+
+    rsp->id        = saved.id;
+    rsp->operation = operation;
+    rsp->status    = status;
+    return BLKTAP_RESPOND;
+}
+
+/*
+ * cow_request: request hook of the copy-on-write tap.
+ *
+ *   PROBE - a copy of the request is stored in reread_list[] (debug aid
+ *           consumed by cow_response) and the request is passed on.
+ *   WRITE - every segment page is stored in the overlay db, keyed by its
+ *           sector number; the request is answered here (BLKTAP_RESPOND).
+ *   READ  - segments are served from the overlay db.  On the first segment
+ *           missing from the db, a copy of the request is stored in
+ *           reread_list[] and the request is passed on (BLKTAP_PASS) so
+ *           that cow_response can overlay db contents on the result.
+ *
+ * Requests from unknown domains, with an out-of-range index, or that hit a
+ * db or allocation failure are answered with BLKIF_RSP_ERROR.  Unknown
+ * operations are passed on.
+ */
+int cow_request(blkif_request_t *req)
+{
+    DB *db;
+    DBT key, data;
+    u64 sector;
+    char *spage, *dpage;
+    int ret, i, idx;
+    blkif_request_t *copy;
+    domid_t dom = ID_TO_DOM(req->id);
+
+    if (((unsigned long)dom >= MAX_DOMS) ||
+        (cows[dom] == NULL) || (cows[dom]->db == NULL)) {
+        printf("Data request for unknown domain!!! %d\n", dom);
+        goto err;
+    }
+
+    db  = cows[dom]->db;
+    idx = ID_TO_IDX(req->id);
+
+    switch (req->operation)
+    {
+    case BLKIF_OP_PROBE:
+        /* debug -- delete */
+        if ((idx < 0) || (idx >= MAX_REQUESTS)) {
+            printf("Bad index!\n");
+            goto err;
+        }
+        copy = (blkif_request_t *)malloc(sizeof(*req));
+        if (copy == NULL) {
+            printf("error allocating request copy.\n");
+            goto err;
+        }
+        memcpy(copy, req, sizeof(*req));
+        free(reread_list[idx]);     /* drop a stale entry instead of leaking */
+        reread_list[idx] = copy;
+        return  BLKTAP_PASS;
+
+    case BLKIF_OP_WRITE:
+        if ((idx < 0) || (idx >= MAX_REQUESTS)) {
+            printf("Bad index!\n");
+            goto err;
+        }
+        for (i = 0; i < req->nr_segments; i++) {
+            memset(&key, 0, sizeof(key));
+            memset(&data, 0, sizeof(data));
+
+            sector = req->sector_number + (8*i);
+            key.data = &sector;
+            key.size = sizeof(sector);
+
+            spage = (char *)MMAP_VADDR(idx, i);
+            data.data = spage;
+            data.size = PAGE_SIZE;
+
+            DPRINTF("cWRITE: sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n",
+                    req->sector_number, sector,
+                    blkif_first_sect(req->frame_and_sects[i]),
+                    blkif_last_sect (req->frame_and_sects[i]),
+                    (long)(sector << 9));
+
+            if ((ret = db->put(db, NULL, &key, &data, 0)) == 0)
+                DPRINTF("db: %lld: key stored.\n", *((u64 *)key.data));
+            else {
+                db->err(db, ret, "DB->put");
+                goto err;
+            }
+        }
+
+        return cow_respond(req, BLKIF_OP_WRITE, BLKIF_RSP_OKAY);
+
+    case BLKIF_OP_READ:
+        if ((idx < 0) || (idx >= MAX_REQUESTS)) {
+            printf("Bad index!\n");
+            goto err;
+        }
+        for (i = 0; i < req->nr_segments; i++) {
+            memset(&key, 0, sizeof(key));
+            memset(&data, 0, sizeof(data));
+
+            sector = req->sector_number + (8*i);
+            key.data = &sector;
+            key.size = sizeof(sector);
+
+            DPRINTF("cREAD: sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n",
+                    req->sector_number, sector,
+                    blkif_first_sect(req->frame_and_sects[i]),
+                    blkif_last_sect (req->frame_and_sects[i]),
+                    (long)(sector << 9));
+
+            if ((ret = db->get(db, NULL, &key, &data, 0)) == 0) {
+                DPRINTF("db: %llu: key retrieved (req).\n",
+                    *((u64 *)key.data));
+
+                /* Records are written page-sized; refuse to copy a whole
+                 * page out of anything shorter. */
+                if (data.size < PAGE_SIZE) {
+                    printf("Short overlay record!\n");
+                    goto err;
+                }
+                dpage = (char *)MMAP_VADDR(idx, i);
+                spage = data.data;
+                memcpy(dpage, spage, PAGE_SIZE);
+
+            } else if (ret == DB_NOTFOUND) {
+                if (reread_list[idx] != NULL) {
+                    printf("Dupe index!\n");
+                    goto err;
+                }
+                copy = (blkif_request_t *)malloc(sizeof(*req));
+                if (copy == NULL) {
+                    printf("error allocating request copy.\n");
+                    goto err;
+                }
+                memcpy(copy, req, sizeof(*req));
+                reread_list[idx] = copy;
+                return BLKTAP_PASS;
+            } else {
+                db->err(db, ret, "DB->get");
+                goto err;
+            }
+        }
+
+        return cow_respond(req, BLKIF_OP_READ, BLKIF_RSP_OKAY);
+    }
+
+    printf("Unknow block operation!\n");
+    return BLKTAP_PASS;
+err:
+    return cow_respond(req, req->operation, BLKIF_RSP_ERROR);
+}
+
+/*
+ * cow_response: response hook of the copy-on-write tap.
+ *
+ * For a successful READ whose request was saved in reread_list[] by
+ * cow_request, every segment present in the overlay db is copied over the
+ * data returned by the backend; segments missing from the db are left as
+ * read.  For a successful PROBE with a saved request, the returned vdisk
+ * entries are printed (debug).  In both cases, and for erroring responses,
+ * the saved request is freed so that its reread_list[] slot can be reused.
+ *
+ * Always returns BLKTAP_PASS; the response itself is never modified.
+ */
+int cow_response(blkif_response_t *rsp)
+{
+    blkif_request_t *req;
+    int i, ret;
+    DB *db;
+    DBT key, data;
+    u64 sector;
+    char *spage, *dpage;
+    int idx = ID_TO_IDX(rsp->id);
+    domid_t dom;
+
+    if ((idx < 0) || (idx >= MAX_REQUESTS)) {
+        printf("Bad index!\n");
+        return BLKTAP_PASS;
+    }
+
+    req = reread_list[idx];
+
+    /* don't touch erroring responses, but do release the saved request:
+     * otherwise the slot stays occupied and later reads on this index
+     * fail with "Dupe index!". */
+    if (rsp->status == BLKIF_RSP_ERROR)
+        goto release;
+
+    if ((rsp->operation == BLKIF_OP_READ) && (req != NULL))
+    {
+        dom = ID_TO_DOM(req->id);
+
+        if (((unsigned long)dom >= MAX_DOMS) ||
+            (cows[dom] == NULL) || (cows[dom]->db == NULL)) {
+            printf("Response from unknown domain!!! Very badness! %d\n", dom);
+            goto release;
+        }
+
+        db = cows[dom]->db;
+
+        for (i = 0; i < req->nr_segments; i++) {
+            memset(&key, 0, sizeof(key));
+            memset(&data, 0, sizeof(data));
+
+            sector = req->sector_number + (8*i);
+            key.data = &sector;
+            key.size = sizeof(sector);
+
+            if ((ret = db->get(db, NULL, &key, &data, 0)) == 0) {
+                printf("db: %llu: key retrieved (rsp).\n",
+                    *((u64 *)key.data));
+
+                /* Records are written page-sized; refuse to copy a whole
+                 * page out of anything shorter. */
+                if (data.size < PAGE_SIZE) {
+                    printf("Short overlay record!\n");
+                    goto release;
+                }
+                dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
+                spage = data.data;
+                memcpy(dpage, spage, PAGE_SIZE);
+
+            } else if (ret == DB_NOTFOUND) {
+                continue; /* We read this from disk. */
+            } else {
+                db->err(db, ret, "DB->get");
+                goto release;
+            }
+        }
+        goto release;
+    }
+
+    if ((rsp->operation == BLKIF_OP_PROBE) && (req != NULL)) {
+
+        vdisk_t *img_info;
+
+        img_info = (vdisk_t *)(char *)MMAP_VADDR(ID_TO_IDX(req->id), 0);
+        /* rsp->status carries the number of vdisk entries returned. */
+        for (i = 0; i < rsp->status; i++)
+            printf("PROBE (%d) device: 0x%04x capacity: %llu, info: 0x%04x\n",
+                    i,
+                    img_info[i].device,
+                    img_info[i].capacity,
+                    img_info[i].info);
+        goto release;
+    }
+
+    return BLKTAP_PASS;
+
+release:
+    free(reread_list[idx]);
+    reread_list[idx] = NULL;
+    return BLKTAP_PASS;
+}
+
+/*
+ * cow_init: reset the plugin's tables -- no domain connected, no saved
+ * request outstanding.  Call once before registering the cow hooks.
+ */
+void cow_init(void)
+{
+    int i;
+
+    for (i = 0; i < MAX_DOMS; i++)
+        cows[i] = NULL;
+
+    /* Index with i: the table has MAX_REQUESTS entries, so writing to
+     * reread_list[MAX_REQUESTS] would be one past the end. */
+    for (i = 0; i < MAX_REQUESTS; i++)
+        reread_list[i] = NULL;
+}
+
diff --git a/tools/blktap/blkcowlib.h b/tools/blktap/blkcowlib.h
new file mode 100644 (file)
index 0000000..e6bd7a5
--- /dev/null
@@ -0,0 +1,14 @@
+/* blkcowlib.h
+ *
+ * copy on write a block device.  in a really inefficient way.
+ * 
+ * (c) 2004 Andrew Warfield.
+ *
+ * public interfaces to the CoW tap.
+ *
+ */
+/* Control-message hook: creates/destroys per-domain state, opens the overlay. */
+int  cow_control  (control_msg_t *msg);
+/* Request hook: stores writes in the overlay, serves reads from it. */
+int  cow_request  (blkif_request_t *req);
+/* Response hook: overlays db contents on reads that went to the backend. */
+int  cow_response (blkif_response_t *rsp);
+/* Resets the plugin's tables; call before registering the hooks above. */
+void cow_init     (void);
diff --git a/tools/blktap/blkdump.c b/tools/blktap/blkdump.c
new file mode 100644 (file)
index 0000000..f7cde9d
--- /dev/null
@@ -0,0 +1,151 @@
+/* blkdump.c
+ *
+ * show a running trace of block requests as they fly by.
+ * 
+ * (c) 2004 Andrew Warfield.
+ */
+#include <stdio.h>
+#include "blktaplib.h"
+/*
+ * control_print: control-message hook that dumps every CMSG_BLKIF_BE
+ * message to stdout.  A message whose subtype is unknown, or whose length
+ * does not match the structure expected for its subtype, is reported as
+ * bad.  Always returns 0.
+ */
+int control_print(control_msg_t *msg)
+{
+    int malformed = 0;
+
+    if (msg->type != CMSG_BLKIF_BE)
+    {
+        printf("***\nUNEXPECTED CTRL MSG MAJOR TYPE(%d)\n***\n", msg->type);
+        return 0;
+    }
+
+    switch(msg->subtype)
+    {
+    case CMSG_BLKIF_BE_CREATE:
+    {
+        blkif_be_create_t *m = (blkif_be_create_t *)msg->msg;
+
+        if ( msg->length != sizeof(blkif_be_create_t) ) {
+            malformed = 1;
+            break;
+        }
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_CREATE(d:%d,h:%d)\n",
+                m->domid, m->blkif_handle);
+        break;
+    }
+    case CMSG_BLKIF_BE_DESTROY:
+    {
+        blkif_be_destroy_t *m = (blkif_be_destroy_t *)msg->msg;
+
+        if ( msg->length != sizeof(blkif_be_destroy_t) ) {
+            malformed = 1;
+            break;
+        }
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_DESTROY(d:%d,h:%d)\n",
+                m->domid, m->blkif_handle);
+        break;
+    }
+    case CMSG_BLKIF_BE_CONNECT:
+    {
+        blkif_be_connect_t *m = (blkif_be_connect_t *)msg->msg;
+
+        if ( msg->length != sizeof(blkif_be_connect_t) ) {
+            malformed = 1;
+            break;
+        }
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_CONNECT(d:%d,h:%d)\n",
+                m->domid, m->blkif_handle);
+        break;
+    }
+    case CMSG_BLKIF_BE_DISCONNECT:
+    {
+        blkif_be_disconnect_t *m = (blkif_be_disconnect_t *)msg->msg;
+
+        if ( msg->length != sizeof(blkif_be_disconnect_t) ) {
+            malformed = 1;
+            break;
+        }
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_DISCONNECT(d:%d,h:%d)\n",
+                m->domid, m->blkif_handle);
+        break;
+    }
+    case CMSG_BLKIF_BE_VBD_CREATE:
+    {
+        blkif_be_vbd_create_t *m = (blkif_be_vbd_create_t *)msg->msg;
+
+        if ( msg->length != sizeof(blkif_be_vbd_create_t) ) {
+            malformed = 1;
+            break;
+        }
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_CREATE(d:%d,h:%d,v:%d)\n",
+                m->domid, m->blkif_handle, m->vdevice);
+        break;
+    }
+    case CMSG_BLKIF_BE_VBD_DESTROY:
+    {
+        blkif_be_vbd_destroy_t *m = (blkif_be_vbd_destroy_t *)msg->msg;
+
+        if ( msg->length != sizeof(blkif_be_vbd_destroy_t) ) {
+            malformed = 1;
+            break;
+        }
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_DESTROY(d:%d,h:%d,v:%d)\n",
+                m->domid, m->blkif_handle, m->vdevice);
+        break;
+    }
+    case CMSG_BLKIF_BE_VBD_GROW:
+    {
+        blkif_be_vbd_grow_t *m = (blkif_be_vbd_grow_t *)msg->msg;
+
+        if ( msg->length != sizeof(blkif_be_vbd_grow_t) ) {
+            malformed = 1;
+            break;
+        }
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_GROW(d:%d,h:%d,v:%d)\n",
+                m->domid, m->blkif_handle, m->vdevice);
+        printf("              Extent: sec_start: %llu sec_len: %llu, dev: %d\n",
+                m->extent.sector_start,
+                m->extent.sector_length,
+                m->extent.device);
+        break;
+    }
+    case CMSG_BLKIF_BE_VBD_SHRINK:
+    {
+        blkif_be_vbd_shrink_t *m = (blkif_be_vbd_shrink_t *)msg->msg;
+
+        if ( msg->length != sizeof(blkif_be_vbd_shrink_t) ) {
+            malformed = 1;
+            break;
+        }
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_SHRINK(d:%d,h:%d,v:%d)\n",
+                m->domid, m->blkif_handle, m->vdevice);
+        break;
+    }
+    default:
+        malformed = 1;
+        break;
+    }
+
+    if (malformed)
+        printf("[CONTROL_MSG] Bad message type or length!\n");
+
+    return 0;
+}
+/*
+ * request_print: request hook that prints a one-line trace of every block
+ * request, followed by one line per segment for non-PROBE operations.
+ * The request is never modified; always returns BLKTAP_PASS.
+ */
+int request_print(blkif_request_t *req)
+{
+    int seg;
+    unsigned long frame;
+
+    if ( req->operation != BLKIF_OP_PROBE ) {
+        printf("[%2u:%2u<%5s] (nr_segs: %03u, dev: %03u, %010llu)\n", 
+                ID_TO_DOM(req->id), ID_TO_IDX(req->id), 
+                blkif_op_name[req->operation], 
+                req->nr_segments, req->device, 
+                req->sector_number);
+
+        /* One line per segment: page frame plus first/last sector. */
+        seg = 0;
+        while (seg < req->nr_segments) {
+            frame = req->frame_and_sects[seg];
+            printf("              (pf: 0x%8lx start: %lu stop: %lu)\n",
+                    (frame & PAGE_MASK),
+                    blkif_first_sect(frame),
+                    blkif_last_sect(frame)
+                    );
+            seg++;
+        }
+    } else {
+        /* PROBE requests carry no segment details worth printing. */
+        printf("[%2u:%2u<%s]\n", ID_TO_DOM(req->id), ID_TO_IDX(req->id),
+                blkif_op_name[req->operation]);
+    }
+
+    return BLKTAP_PASS;
+}
+
+/*
+ * response_print: response hook that prints a one-line trace of every block
+ * response; the status is shown for everything except PROBE.
+ * The response is never modified; always returns BLKTAP_PASS.
+ */
+int response_print(blkif_response_t *rsp)
+{
+    if ( rsp->operation != BLKIF_OP_PROBE ) {
+        printf("[%2u:%2u>%5s] (status: %d)\n", 
+                ID_TO_DOM(rsp->id), ID_TO_IDX(rsp->id), 
+                blkif_op_name[rsp->operation], 
+                rsp->status);
+    } else {
+        printf("[%2u:%2u>%s]\n", ID_TO_DOM(rsp->id), ID_TO_IDX(rsp->id),
+                blkif_op_name[rsp->operation]);
+    }
+
+    return BLKTAP_PASS;
+}
+
+/*
+ * Entry point of the block-request tracer.
+ *
+ * Registers the three printing hooks with blktaplib and then calls
+ * blktap_listen().  argc/argv are not used.  Returns 0.
+ */
+int main(int argc, char *argv[])
+{
+    blktap_register_ctrl_hook("control_print", control_print);
+    blktap_register_request_hook("request_print", request_print);
+    blktap_register_response_hook("response_print", response_print);
+    blktap_listen();
+    
+    return 0;
+}
diff --git a/tools/blktap/blkgnbd.c b/tools/blktap/blkgnbd.c
new file mode 100644 (file)
index 0000000..6a6bd67
--- /dev/null
@@ -0,0 +1,19 @@
+/* blkgnbd.c
+ *
+ * gnbd-backed disk.
+ */
+
+#include "blktaplib.h"
+#include "blkgnbdlib.h"
+
+
+/*
+ * Entry point of the gnbd-backed disk tap.
+ *
+ * Initialises the gnbd plugin, registers its control and request hooks
+ * (no response hook is registered) and then calls blktap_listen().
+ * argc/argv are not used.  Returns 0.
+ */
+int main(int argc, char *argv[])
+{
+    gnbd_init();
+    
+    blktap_register_ctrl_hook("gnbd_control", gnbd_control);
+    blktap_register_request_hook("gnbd_request", gnbd_request);
+    blktap_listen();
+    
+    return 0;
+}
diff --git a/tools/blktap/blkgnbdlib.c b/tools/blktap/blkgnbdlib.c
new file mode 100644 (file)
index 0000000..6eeb49c
--- /dev/null
@@ -0,0 +1,471 @@
+/* blkgnbdlib.c
+ *
+ * gnbd image-backed block device.
+ * 
+ * (c) 2004 Andrew Warfield.
+ *
+ * Xend has been modified to use an amorfs:[fsid] disk tag.
+ * This will show up as device type (maj:240,min:0) = 61440.
+ *
+ * The fsid is placed in the sec_start field of the disk extent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <db.h>       
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/poll.h>
+#include "blktaplib.h"
+#include "libgnbd/libgnbd.h"
+
+#define GNBD_SERVER  "skirmish.cl.cam.ac.uk"
+#define GNBD_CLIENT  "pengi-0.xeno.cl.cam.ac.uk"
+#define GNBD_MOUNT   "fc2_akw27"
+#define GNBD_PORT    0x38e7
+
+#define MAX_DOMS        1024
+#define MAX_IMGNAME_LEN  255
+#define AMORFS_DEV     61440
+#define MAX_REQUESTS      64 /* must be synced with the blkif drivers. */
+#define SECTOR_SHIFT       9
+                                                                                
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+        
+#if 1
+/*
+ * ASSERT(_p): if _p is false, report the failed expression with its line
+ * and file, then terminate the process with abort().
+ *
+ * Wrapped in do { } while (0) so that it behaves as a single statement
+ * (safe in an unbraced if/else).  The message ends in a newline and stdout
+ * is flushed, otherwise buffered output is lost when the process dies.
+ */
+#define ASSERT(_p)                                                       \
+    do {                                                                 \
+        if ( !(_p) ) {                                                   \
+            printf("Assertion '%s' failed, line %d, file %s\n", #_p ,    \
+                   __LINE__, __FILE__);                                  \
+            fflush(stdout);                                              \
+            abort();                                                     \
+        }                                                                \
+    } while (0)
+#else
+#define ASSERT(_p) ((void)0)
+#endif
+
+#define GH_DISCONNECTED 0
+#define GH_PROBEWAITING 1
+#define GH_CONNECTED    2
+
+/* Per-domain gnbd connection state; one record per entry of gnbds[]. */
+typedef struct {
+    /* These need to turn into an array/rbtree for multi-disk support. */
+    /* Handle returned by gnbd_setup(); NULL until VBD_GROW connects. */
+    struct gnbd_handle *gh;
+    /* One of GH_DISCONNECTED, GH_PROBEWAITING, GH_CONNECTED. */
+    int          gh_state;
+    /* pending_list[] index of a PROBE deferred until login completes. */
+    int          probe_idx; /* This really needs cleaning up after hotos. */
+    /* Descriptor from gnbd_fd(gh); attached to the tap's poll set. */
+    int          fd;
+    /* Taken from extent.sector_start of the VBD_GROW message. */
+    u64          fsid;
+    /* Name of the gnbd mount passed to gnbd_setup(). */
+    char         gnbdname[MAX_IMGNAME_LEN];
+    /* Virtual device id from VBD_GROW; reported in PROBE replies. */
+    blkif_vdev_t vdevice;
+} gnbd_t;
+
+/* Note on pending_reqs: I assume all reqs are queued before they start to 
+ * get filled.  so count of 0 is an unused record.
+ *
+ * req   is a copy of the request taken when it is stolen from the tap.
+ * count is the number of gnbd operations still outstanding for it: set to
+ *       nr_segments (or 1 for a deferred PROBE) and decremented on every
+ *       GNBD_REQUEST_DONE; the response is injected when it reaches 0.
+ */
+typedef struct {
+    blkif_request_t  req;
+    int              count;
+} pending_req_t;
+
+/* Connection records indexed by domain id; NULL means "not connected". */
+static gnbd_t          *gnbds[MAX_DOMS];
+/* In-flight requests indexed by ID_TO_IDX(req->id). */
+static pending_req_t    pending_list[MAX_REQUESTS];
+static int              pending_count = 0; /* debugging */
+
+
+/*
+ * get_gnbd_by_fd: return the first connection record whose descriptor
+ * equals fd, or NULL when no connected domain uses it.
+ *
+ * This is a linear scan over all domain slots for the moment; it needs to
+ * be cleaned up for multi-disk support.
+ */
+gnbd_t *get_gnbd_by_fd(int fd)
+{
+    gnbd_t **slot;
+
+    for (slot = gnbds; slot < gnbds + MAX_DOMS; slot++) {
+        if (*slot == NULL)
+            continue;
+        if ((*slot)->fd == fd)
+            return *slot;
+    }
+
+    return NULL;
+}
+
+int gnbd_pollhook(int fd);
+
+/*
+ * gnbd_control: control-message hook of the gnbd plugin.
+ *
+ * Handles three CMSG_BLKIF_BE subtypes:
+ *   CREATE   - allocate the per-domain gnbd_t record;
+ *   DESTROY  - detach the connection's fd from the poll set, free the
+ *              gnbd handle and the record;
+ *   VBD_GROW - connect to the (hard-coded) gnbd mount and attach the
+ *              connection's fd to the tap's poll set.
+ * Any other subtype is ignored.
+ *
+ * Domain ids are range-checked against MAX_DOMS before they index gnbds[].
+ * Always returns 0, including on malformed messages and on failures.
+ */
+int gnbd_control(control_msg_t *msg)
+{
+    domid_t  domid;
+
+    if (msg->type != CMSG_BLKIF_BE)
+    {
+        printf("***\nUNEXPECTED CTRL MSG MAJOR TYPE(%d)\n***\n", msg->type);
+        return 0;
+    }
+
+    switch(msg->subtype)
+    {
+    case CMSG_BLKIF_BE_CREATE:
+        if ( msg->length != sizeof(blkif_be_create_t) )
+            goto parse_error;
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_CREATE(d:%d,h:%d)\n",
+                ((blkif_be_create_t *)msg->msg)->domid,
+                ((blkif_be_create_t *)msg->msg)->blkif_handle);
+        domid = ((blkif_be_create_t *)msg->msg)->domid;
+        if ((unsigned long)domid >= MAX_DOMS)
+            goto bad_domid;
+        if (gnbds[domid] != NULL) {
+            printf("attempt to connect from an existing dom!\n");
+            return 0;
+        }
+
+        gnbds[domid] = (gnbd_t *)malloc(sizeof(gnbd_t));
+        if (gnbds[domid] == NULL) {
+            printf("error allocating gnbd record.\n");
+            return 0;
+        }
+
+        /* Start from a fully defined record: get_gnbd_by_fd() compares
+         * the fd of every allocated record, so it must not hold garbage
+         * before a connection exists. */
+        memset(gnbds[domid], 0, sizeof(gnbd_t));
+        gnbds[domid]->gh       = NULL;
+        gnbds[domid]->gh_state = GH_DISCONNECTED;
+        gnbds[domid]->fd       = -1;
+        gnbds[domid]->fsid     = 0;
+
+        break;
+
+    case CMSG_BLKIF_BE_DESTROY:
+        if ( msg->length != sizeof(blkif_be_destroy_t) )
+            goto parse_error;
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_DESTROY(d:%d,h:%d)\n",
+                ((blkif_be_destroy_t *)msg->msg)->domid,
+                ((blkif_be_destroy_t *)msg->msg)->blkif_handle);
+
+        domid = ((blkif_be_destroy_t *)msg->msg)->domid;
+        if ((unsigned long)domid >= MAX_DOMS)
+            goto bad_domid;
+        if (gnbds[domid] != NULL) {
+            if (gnbds[domid]->gh != NULL) {
+                blktap_detach_poll(gnbds[domid]->fd);
+                free(gnbds[domid]->gh); /* XXX: Need a gnbd close call! */
+            }
+            free( gnbds[domid] );
+            gnbds[domid] = NULL;
+        }
+        break;
+
+    case CMSG_BLKIF_BE_VBD_GROW:
+    {
+        blkif_be_vbd_grow_t *grow;
+
+        if ( msg->length != sizeof(blkif_be_vbd_grow_t) )
+            goto parse_error;
+        grow = (blkif_be_vbd_grow_t *)msg->msg;
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_GROW(d:%d,h:%d,v:%d)\n",
+                grow->domid, grow->blkif_handle, grow->vdevice);
+        printf("              Extent: sec_start: %llu sec_len: %llu, dev: %d\n",
+                grow->extent.sector_start,
+                grow->extent.sector_length,
+                grow->extent.device);
+        domid = grow->domid;
+        if ((unsigned long)domid >= MAX_DOMS)
+            goto bad_domid;
+        if (gnbds[domid] == NULL) {
+            printf("VBD_GROW on unconnected domain!\n");
+            return 0;
+        }
+
+        if (grow->extent.device != AMORFS_DEV) {
+            printf("VBD_GROW on non-amorfs device!\n");
+            return 0;
+        }
+
+        /* TODO: config support for arbitrary gnbd files/modes. */
+        snprintf(gnbds[domid]->gnbdname, sizeof(gnbds[domid]->gnbdname),
+                 "%s", GNBD_MOUNT);
+
+        /* NOTE(review): a second VBD_GROW for the same domain replaces
+         * gh/fd without tearing down the previous connection. */
+        gnbds[domid]->fsid   = grow->extent.sector_start;
+        gnbds[domid]->vdevice = grow->vdevice;
+        gnbds[domid]->gh_state = GH_DISCONNECTED;
+        gnbds[domid]->gh = gnbd_setup(GNBD_SERVER, GNBD_PORT,
+            gnbds[domid]->gnbdname, GNBD_CLIENT);
+        if (gnbds[domid]->gh == NULL) {
+            printf("Couldn't connect to gnbd mount!!\n");
+            return 0;
+        }
+        gnbds[domid]->fd = gnbd_fd(gnbds[domid]->gh);
+        blktap_attach_poll(gnbds[domid]->fd, POLLIN, gnbd_pollhook);
+
+        printf("gnbd mount connected. (%s)\n", gnbds[domid]->gnbdname);
+        break;
+    }
+    }
+    return 0;
+
+parse_error:
+    printf("Bad control message!\n");
+    return 0;
+
+bad_domid:
+    printf("Domain id out of range! (%lu)\n", (unsigned long)domid);
+    return 0;
+}
+/*
+ * gnbd_blkif_probe: answer a PROBE request for the (single) gnbd disk.
+ *
+ * req  - the PROBE request; it is overwritten in place with the response.
+ * gnbd - connection record providing the vdevice id and the gnbd handle.
+ *
+ * The request must carry exactly one page-sized buffer (sectors 0..7).
+ * One vdisk_t entry describing the disk is written to that buffer and the
+ * response status is set to 1 (number of disks).  A malformed request gets
+ * BLKIF_RSP_ERROR.  Always returns BLKTAP_RESPOND.
+ */
+static int gnbd_blkif_probe(blkif_request_t *req, gnbd_t *gnbd)
+{
+    vdisk_t *gnbd_info;
+    blkif_response_t *rsp;
+    /* The response overlays the request buffer, so keep a copy of the
+     * request fields that are echoed back. */
+    blkif_request_t saved = *req;
+
+    /* We expect one buffer only. */
+    if ( req->nr_segments != 1 )
+        goto err;
+
+    /* Make sure the buffer is page-sized. */
+    if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) ||
+         (blkif_last_sect (req->frame_and_sects[0]) != 7) )
+        goto err;
+
+    /* loop for multiple gnbds would start here. */
+
+    gnbd_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0);
+    gnbd_info[0].device   = gnbd->vdevice;
+    gnbd_info[0].info     = VDISK_TYPE_DISK | VDISK_FLAG_VIRT;
+    gnbd_info[0].capacity = gnbd_sectors(gnbd->gh);
+
+    printf("[SECTORS] %llu", gnbd_info[0].capacity);
+
+    DPRINTF("iPROBE! device: 0x%04x capacity: %llu\n", gnbd_info[0].device,
+            gnbd_info[0].capacity);
+
+    rsp = (blkif_response_t *)req;
+    rsp->id = saved.id;
+    rsp->operation = BLKIF_OP_PROBE;
+    rsp->status = 1; /* number of disks */
+
+    return  BLKTAP_RESPOND;
+err:
+    rsp = (blkif_response_t *)req;
+    rsp->id = saved.id;
+    rsp->operation = saved.operation;
+    rsp->status = BLKIF_RSP_ERROR;
+    return BLKTAP_RESPOND;
+}
+
+/*
+ * gnbd_request: request hook of the gnbd plugin; it terminates the request
+ * stream (requests are never passed further down).
+ *
+ *   PROBE      - answered immediately once connected; while the gnbd login
+ *                is still in progress the request is parked in
+ *                pending_list[] and answered from gnbd_pollhook().
+ *   READ/WRITE - the request is copied into pending_list[] and one gnbd
+ *                operation is issued per segment, tagged with the request
+ *                index; the response is injected from gnbd_pollhook() when
+ *                the last segment completes (BLKTAP_STOLEN).
+ *
+ * Requests from unknown domains, with an out-of-range index, without
+ * segments, of unknown type, or whose submission fails are answered with
+ * BLKIF_RSP_ERROR (BLKTAP_RESPOND).
+ */
+int gnbd_request(blkif_request_t *req)
+{
+    struct gnbd_handle *gh;
+    u64 sector;
+    char *spage, *dpage;
+    int ret, i, idx;
+    blkif_request_t saved;
+    blkif_response_t *rsp;
+    domid_t dom = ID_TO_DOM(req->id);
+
+    if (((unsigned long)dom >= MAX_DOMS) ||
+        (gnbds[dom] == NULL) || (gnbds[dom]->gh == NULL)) {
+        printf("Data request for unknown domain!!! %d\n", dom);
+        goto err;
+    }
+
+    gh  = gnbds[dom]->gh;
+    idx = ID_TO_IDX(req->id);
+
+    switch (req->operation)
+    {
+    case BLKIF_OP_PROBE:
+    {
+        printf("PROBE!\n");
+        if ( gnbds[dom]->gh_state == GH_PROBEWAITING ) {
+            printf("Already have a PROBE outstanding!\n");
+            goto err;
+        }
+
+        if ( gnbds[dom]->gh_state == GH_DISCONNECTED )
+        {
+            /* need to defer until we are connected. */
+            printf("Deferring PROBE!\n");
+            if ((idx < 0) || (idx >= MAX_REQUESTS)) {
+                printf("Bad index!\n");
+                goto err;
+            }
+            ASSERT(pending_list[idx].count == 0);
+            memcpy(&pending_list[idx].req, req, sizeof(*req));
+            pending_list[idx].count = 1;
+
+            gnbds[dom]->probe_idx = idx;
+            gnbds[dom]->gh_state  = GH_PROBEWAITING;
+
+            return BLKTAP_STOLEN;
+        }
+
+        return gnbd_blkif_probe(req, gnbds[dom]);
+    }
+    case BLKIF_OP_WRITE:
+    {
+        unsigned long size;
+
+        if ((idx < 0) || (idx >= MAX_REQUESTS)) {
+            printf("Bad index!\n");
+            goto err;
+        }
+        /* With no segments nothing would ever complete, so the stolen
+         * request would never be answered. */
+        if (req->nr_segments == 0) {
+            printf("Empty WRITE request!\n");
+            goto err;
+        }
+        ASSERT(pending_list[idx].count == 0);
+        memcpy(&pending_list[idx].req, req, sizeof(*req));
+        pending_list[idx].count = req->nr_segments;
+        pending_count++; /* dbg */
+
+        for (i = 0; i < req->nr_segments; i++) {
+
+            sector = req->sector_number + (8*i);
+
+            size = blkif_last_sect (req->frame_and_sects[i]) -
+                   blkif_first_sect(req->frame_and_sects[i]) + 1;
+
+            DPRINTF("iWRITE: sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n",
+                    req->sector_number, sector,
+                    blkif_first_sect(req->frame_and_sects[i]),
+                    blkif_last_sect (req->frame_and_sects[i]),
+                    (long)(sector << SECTOR_SHIFT));
+
+            spage  = (char *)MMAP_VADDR(idx, i);
+            spage += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
+
+            ret = gnbd_write(gh, sector, size, spage, (unsigned long)idx);
+            if (ret) {
+                printf("gnbd error on WRITE\n");
+                goto err_release;
+            }
+        }
+
+        return BLKTAP_STOLEN;
+    }
+    case BLKIF_OP_READ:
+    {
+        unsigned long size;
+
+        if ((idx < 0) || (idx >= MAX_REQUESTS)) {
+            printf("Bad index!\n");
+            goto err;
+        }
+        /* See the WRITE case: an empty request would never complete. */
+        if (req->nr_segments == 0) {
+            printf("Empty READ request!\n");
+            goto err;
+        }
+        ASSERT(pending_list[idx].count == 0);
+        memcpy(&pending_list[idx].req, req, sizeof(*req));
+        pending_list[idx].count = req->nr_segments;
+        pending_count++; /* dbg */
+
+        for (i = 0; i < req->nr_segments; i++) {
+
+            sector  = req->sector_number + (8*i);
+
+            size = blkif_last_sect (req->frame_and_sects[i]) -
+                   blkif_first_sect(req->frame_and_sects[i]) + 1;
+
+            DPRINTF("iREAD : sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n",
+                    req->sector_number, sector,
+                    blkif_first_sect(req->frame_and_sects[i]),
+                    blkif_last_sect (req->frame_and_sects[i]),
+                    (long)(sector << SECTOR_SHIFT));
+
+            dpage  = (char *)MMAP_VADDR(idx, i);
+            dpage += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
+
+            ret = gnbd_read(gh, sector, size, dpage, (unsigned long)idx);
+            if (ret) {
+                printf("gnbd error on READ\n");
+                goto err_release;
+            }
+
+        }
+
+        return BLKTAP_STOLEN;
+    }
+    }
+
+    printf("Unknown block operation!\n");
+    goto err;
+
+err_release:
+    /* The request is answered with an error right here, so give the slot
+     * back; leaving count non-zero would trip the ASSERT on the next
+     * request that reuses this index.
+     * NOTE(review): segments submitted before the failure may still
+     * complete later; gnbd_pollhook() then sees count == 0 and reports a
+     * bad cookie, unless the slot has been reused in the meantime. */
+    pending_list[idx].count = 0;
+    pending_count--; /* dbg */
+err:
+    /* The response overlays the request buffer: copy the request first so
+     * that the echoed fields are read before anything is overwritten. */
+    saved = *req;
+    rsp = (blkif_response_t *)req;
+    rsp->id = saved.id;
+    rsp->operation = saved.operation;
+    rsp->status = BLKIF_RSP_ERROR;
+    return BLKTAP_RESPOND;
+}
+
+/* the gnbd library terminates the request stream. _resp is a noop. */
+/*
+ * gnbd_response: response hook placeholder.  It does nothing with the
+ * response and always returns BLKTAP_PASS.
+ */
+int gnbd_response(blkif_response_t *rsp)
+{
+    (void)rsp; /* intentionally unused */
+
+    return BLKTAP_PASS;
+}
+
+/*
+ * gnbd_pollhook: poll callback for a gnbd connection's descriptor.
+ *
+ * Lets the gnbd library process the incoming reply and acts on its result:
+ *   GNBD_LOGIN_DONE   - mark the connection as connected and answer a
+ *                       PROBE that was deferred during login, if any;
+ *   GNBD_REQUEST_DONE - one segment of a pending request finished; when it
+ *                       was the last one, inject BLKIF_RSP_OKAY;
+ *   GNBD_CONTINUE / 0 - nothing to do yet.
+ *
+ * Returns -1 if fd belongs to no known connection, 0 otherwise.
+ */
+int gnbd_pollhook(int fd)
+{
+    int err;
+    struct gnbd_handle *gh;
+    blkif_request_t *req;
+    blkif_response_t *rsp;
+    unsigned long idx;
+
+    gnbd_t *gnbd = get_gnbd_by_fd(fd);
+
+    if (gnbd == NULL) {
+        printf("GNBD badness: got poll hook on unknown device. (%d)\n", fd);
+        return -1;
+    }
+    gh = gnbd->gh;
+    err = gnbd_reply(gh);
+    switch (err) {
+    case GNBD_LOGIN_DONE:
+        if (gnbd->gh_state == GH_PROBEWAITING) {
+            req = (blkif_request_t *)&pending_list[gnbd->probe_idx].req;
+            printf("[!] Sending deferred PROBE!\n");
+            /* Builds the response in place, over the saved request. */
+            gnbd_blkif_probe(req, gnbd);
+            pending_list[gnbd->probe_idx].count = 0;
+            rsp = (blkif_response_t *)req;
+            blktap_inject_response(rsp);
+        }
+        gnbd->gh_state = GH_CONNECTED;
+        printf("GNBD_LOGIN_DONE (%d)\n", fd);
+        break;
+
+    case GNBD_REQUEST_DONE: /* switch to idx */
+        idx = gnbd_finished_request(gh);
+        /* Validate the cookie before it is used to index pending_list[]. */
+        if ((idx > MAX_REQUESTS-1) || (pending_list[idx].count == 0)){
+            printf("gnbd returned a bad cookie (%lu)!\n", idx);
+            break;
+        }
+        req = (blkif_request_t *)&pending_list[idx].req;
+
+        pending_list[idx].count--;
+
+        if (pending_list[idx].count == 0) {
+            /* The response overlays the request, so copy the request
+             * before any response field is written. */
+            blkif_request_t tmp = *req;
+            pending_count--; /* dbg */
+            rsp = (blkif_response_t *)req;
+            rsp->id = tmp.id;
+            rsp->operation = tmp.operation;
+            rsp->status = BLKIF_RSP_OKAY;
+            blktap_inject_response(rsp);
+        }
+        break;
+
+    case GNBD_CONTINUE:
+        break;
+
+    case 0:
+        break;
+
+    default:
+        printf("gnbd_reply error");
+        break;
+    }
+    return 0;
+}
+
+/*
+ * gnbd_init: reset the plugin's tables -- no domain connected, every
+ * pending-request slot unused -- and announce the plugin on stdout.
+ */
+void gnbd_init(void)
+{
+    int dom  = MAX_DOMS;
+    int slot = MAX_REQUESTS;
+
+    while (dom-- > 0)
+        gnbds[dom] = NULL;
+
+    /* A count of 0 marks a pending_list[] record as unused. */
+    while (slot-- > 0)
+        pending_list[slot].count = 0;
+
+    printf("GNBD image plugin initialized\n");
+}
+
diff --git a/tools/blktap/blkgnbdlib.h b/tools/blktap/blkgnbdlib.h
new file mode 100644 (file)
index 0000000..b95d240
--- /dev/null
@@ -0,0 +1,16 @@
+/* blkgnbdlib.h
+ *
+ * gnbd image-backed block device.
+ * 
+ * (c) 2004 Andrew Warfield.
+ *
+ * Xend has been modified to use an amorfs:[fsid] disk tag.
+ * This will show up as device type (maj:240,min:0) = 61440.
+ *
+ * The fsid is placed in the sec_start field of the disk extent.
+ */
+
+int gnbd_control(control_msg_t *msg);
+int gnbd_request(blkif_request_t *req);
+int gnbd_response(blkif_response_t *rsp); /* noop */
+void gnbd_init(void);
diff --git a/tools/blktap/blkimg.c b/tools/blktap/blkimg.c
new file mode 100644 (file)
index 0000000..fc746ad
--- /dev/null
@@ -0,0 +1,19 @@
+/* blkimg.c
+ *
+ * file-backed disk.
+ */
+
+#include "blktaplib.h"
+#include "blkimglib.h"
+
+
+/*
+ * Entry point of the file-backed disk tap.
+ *
+ * Initialises the image plugin, registers its control and request hooks
+ * with the tap library and then enters blktap_listen().  The command line
+ * is not examined.
+ */
+int main(int argc, char *argv[])
+{
+    (void)argc;
+    (void)argv;
+
+    image_init();
+
+    /* control messages first, then the data path */
+    blktap_register_ctrl_hook("image_control", image_control);
+    blktap_register_request_hook("image_request", image_request);
+
+    blktap_listen();
+    return 0;
+}
diff --git a/tools/blktap/blkimglib.c b/tools/blktap/blkimglib.c
new file mode 100644 (file)
index 0000000..075a2d9
--- /dev/null
@@ -0,0 +1,325 @@
+/* blkimglib.c
+ *
+ * file image-backed block device.
+ * 
+ * (c) 2004 Andrew Warfield.
+ *
+ * Xend has been modified to use an amorfs:[fsid] disk tag.
+ * This will show up as device type (maj:240,min:0) = 61440.
+ *
+ * The fsid is placed in the sec_start field of the disk extent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <db.h>       
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <errno.h>
+#include "blktaplib.h"
+
+//#define TMP_IMAGE_FILE_NAME "/dev/sda1"
+#define TMP_IMAGE_FILE_NAME "fc3.image"
+
+#define MAX_DOMS        1024
+#define MAX_IMGNAME_LEN  255
+#define AMORFS_DEV     61440
+#define MAX_REQUESTS      64 /* must be synced with the blkif drivers. */
+#define SECTOR_SHIFT       9
+                                                                                
+#if 0
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+                                                                                
+
+/* Per-domain state of one file-backed virtual disk (one entry of images[]). */
+typedef struct {
+    /* These need to turn into an array/rbtree for multi-disk support. */
+    FILE *img;                      /* open stream on the image file; NULL until VBD_GROW opens it */
+    u64  fsid;                      /* taken from extent.sector_start of the VBD_GROW message */
+    char imgname[MAX_IMGNAME_LEN];  /* file name of the image (currently always TMP_IMAGE_FILE_NAME) */
+    blkif_vdev_t   vdevice;         /* virtual device number, reported back in PROBE replies */
+} image_t;
+
+image_t         *images[MAX_DOMS];
+blkif_request_t *reread_list[MAX_REQUESTS];
+
+/*
+ * image_control: control-message hook of the file-backed image plugin.
+ *
+ * Maintains the per-domain records in images[] in response to the
+ * CMSG_BLKIF_BE_* sub-messages:
+ *   CREATE   - allocate an empty record for the domain.
+ *   DESTROY  - close the image file (if open) and free the record.
+ *   VBD_GROW - store vdevice/fsid and open the backing image file.
+ * All other subtypes are ignored.
+ *
+ * msg: control message as taken from the control ring.
+ *
+ * Returns 0 in every case; problems are only reported on stdout.
+ */
+int image_control(control_msg_t *msg)
+{
+    domid_t  domid;
+
+    if (msg->type != CMSG_BLKIF_BE)
+    {
+        printf("***\nUNEXPECTED CTRL MSG MAJOR TYPE(%d)\n***\n", msg->type);
+        return 0;
+    }
+
+    switch(msg->subtype)
+    {
+    case CMSG_BLKIF_BE_CREATE:
+    {
+        blkif_be_create_t *create = (blkif_be_create_t *)msg->msg;
+
+        if ( msg->length != sizeof(blkif_be_create_t) )
+            goto parse_error;
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_CREATE(d:%d,h:%d)\n",
+                create->domid, create->blkif_handle);
+
+        /* images[] only has MAX_DOMS slots; test the wire value itself,
+         * before it is narrowed to domid_t. */
+        if (create->domid >= MAX_DOMS)
+            goto bad_domain;
+        domid = create->domid;
+
+        if (images[domid] != NULL) {
+            printf("attempt to connect from an existing dom!\n");
+            return 0;
+        }
+
+        images[domid] = (image_t *)malloc(sizeof(image_t));
+        if (images[domid] == NULL) {
+            printf("error allocating image record.\n");
+            return 0;
+        }
+
+        images[domid]->img  = NULL;
+        images[domid]->fsid = 0;
+
+        printf("Image connected.\n");
+        break;
+    }
+
+    case CMSG_BLKIF_BE_DESTROY:
+    {
+        blkif_be_destroy_t *destroy = (blkif_be_destroy_t *)msg->msg;
+
+        if ( msg->length != sizeof(blkif_be_destroy_t) )
+            goto parse_error;
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_DESTROY(d:%d,h:%d)\n",
+                destroy->domid, destroy->blkif_handle);
+
+        if (destroy->domid >= MAX_DOMS)
+            goto bad_domain;
+        domid = destroy->domid;
+
+        if (images[domid] != NULL) {
+            if (images[domid]->img != NULL)
+                fclose( images[domid]->img );
+            free( images[domid] );
+            images[domid] = NULL;
+        }
+        break;
+    }
+
+    case CMSG_BLKIF_BE_VBD_GROW:
+    {
+        blkif_be_vbd_grow_t *grow = (blkif_be_vbd_grow_t *)msg->msg;
+
+        if ( msg->length != sizeof(blkif_be_vbd_grow_t) )
+            goto parse_error;
+        printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_GROW(d:%d,h:%d,v:%d)\n",
+                grow->domid, grow->blkif_handle, grow->vdevice);
+        printf("              Extent: sec_start: %llu sec_len: %llu, dev: %d\n",
+                (unsigned long long)grow->extent.sector_start,
+                (unsigned long long)grow->extent.sector_length,
+                grow->extent.device);
+
+        if (grow->domid >= MAX_DOMS)
+            goto bad_domain;
+        domid = grow->domid;
+
+        if (images[domid] == NULL) {
+            printf("VBD_GROW on unconnected domain!\n");
+            return 0;
+        }
+
+        if (grow->extent.device != AMORFS_DEV) {
+            printf("VBD_GROW on non-amorfs device!\n");
+            return 0;
+        }
+
+        /* A second GROW for the same domain must not leak the stream
+         * opened by the first one. */
+        if (images[domid]->img != NULL) {
+            fclose(images[domid]->img);
+            images[domid]->img = NULL;
+        }
+
+        /* TODO: config support for arbitrary image files/modes. */
+        snprintf(images[domid]->imgname, sizeof(images[domid]->imgname),
+                 "%s", TMP_IMAGE_FILE_NAME);
+
+        images[domid]->fsid    = grow->extent.sector_start;
+        images[domid]->vdevice = grow->vdevice;
+        images[domid]->img     = fopen64(images[domid]->imgname, "r+");
+        if (images[domid]->img == NULL) {
+            printf("Couldn't open image file!\n");
+            return 0;
+        }
+
+        printf("Image file opened. (%s)\n", images[domid]->imgname);
+        break;
+    }
+    }
+    return 0;
+
+parse_error:
+    printf("Bad control message!\n");
+    return 0;
+
+bad_domain:
+    printf("Control message for out-of-range domain!\n");
+    return 0;
+}
+/*
+ * image_request: request hook that services block requests from the
+ * domain's image file.
+ *
+ * req: request taken from the frontend ring.  The response is built in
+ *      place over the same memory, so everything needed from the request
+ *      is copied out first (as the gnbd plugin does) because request and
+ *      response fields may overlap.
+ *
+ * PROBE fills the first mapped page with one vdisk_t describing the image
+ * and answers with status 1 (number of disks).  READ and WRITE transfer
+ * each segment between the mapped data pages and the image file; the
+ * segments are laid out back to back on disk starting at sector_number.
+ *
+ * Always returns BLKTAP_RESPOND; on any failure the response carries
+ * BLKIF_RSP_ERROR.
+ */
+int image_request(blkif_request_t *req)
+{
+    FILE *img;
+    u64 sector;
+    char *spage, *dpage;
+    int ret, i;
+    blkif_response_t *rsp;
+    blkif_request_t saved = *req;   /* stable copy: *req is overwritten below */
+    domid_t dom = ID_TO_DOM(saved.id);
+
+    /* images[] only has MAX_DOMS slots. */
+    if ((dom >= MAX_DOMS) ||
+        (images[dom] == NULL) || (images[dom]->img == NULL)) {
+        printf("Data request for unknown domain!!! %d\n", dom);
+        goto err;
+    }
+
+    /* Never walk past the end of frame_and_sects[] / the mapped pages. */
+    if (saved.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+        printf("Too many segments in request!\n");
+        goto err;
+    }
+
+    img = images[dom]->img;
+
+    switch (saved.operation)
+    {
+    case BLKIF_OP_PROBE:
+    {
+        int fd;
+        struct stat st;
+        vdisk_t *img_info;
+
+        /* We expect one buffer only. */
+        if ( saved.nr_segments != 1 )
+            goto err;
+
+        /* Make sure the buffer is page-sized. */
+        if ( (blkif_first_sect(saved.frame_and_sects[0]) != 0) ||
+             (blkif_last_sect (saved.frame_and_sects[0]) != 7) )
+            goto err;
+
+        /* loop for multiple images would start here. */
+
+        fd = fileno(img);
+        if (fd == -1) {
+            printf("Couldn't get image fd in PROBE!\n");
+            goto err;
+        }
+
+        ret = fstat(fd, &st);
+        if (ret != 0) {
+            printf("Couldn't stat image in PROBE!\n");
+            goto err;
+        }
+
+        img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(saved.id), 0);
+        img_info[0].device   = images[dom]->vdevice;
+        img_info[0].info     = VDISK_TYPE_DISK | VDISK_FLAG_VIRT;
+        img_info[0].capacity = (st.st_size >> SECTOR_SHIFT);
+
+        if (img_info[0].capacity == 0)
+            img_info[0].capacity = ((u64)1 << 63); // xend does this too.
+
+        DPRINTF("iPROBE! device: 0x%04x capacity: %llu\n", img_info[0].device,
+                img_info[0].capacity);
+
+        rsp = (blkif_response_t *)req;
+        rsp->id = saved.id;
+        rsp->operation = BLKIF_OP_PROBE;
+        rsp->status = 1; /* number of disks */
+
+        return  BLKTAP_RESPOND;
+    }
+    case BLKIF_OP_WRITE:
+    {
+        unsigned long first, last, size;
+
+        sector = saved.sector_number;
+        for (i = 0; i < saved.nr_segments; i++) {
+
+            first = blkif_first_sect(saved.frame_and_sects[i]);
+            last  = blkif_last_sect (saved.frame_and_sects[i]);
+            if (last < first) {
+                printf("bad segment on WRITE\n");
+                goto err;
+            }
+            size = last - first + 1;
+
+            /* No narrowing cast: the offset must keep all 64 bits. */
+            ret = fseeko64(img, sector << SECTOR_SHIFT, SEEK_SET);
+            if (ret != 0) {
+                printf("fseek error on WRITE\n");
+                goto err;
+            }
+
+            DPRINTF("iWRITE: sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n",
+                    saved.sector_number, sector, first, last,
+                    (long)(sector << SECTOR_SHIFT));
+
+            spage  = (char *)MMAP_VADDR(ID_TO_IDX(saved.id), i);
+            spage += first << SECTOR_SHIFT;
+            ret = fwrite(spage, size << SECTOR_SHIFT, 1, img);
+            if (ret != 1) {
+                printf("fwrite error on WRITE (%d)\n", errno);
+                goto err;
+            }
+
+            /* The next segment starts where this one ended on disk; a
+             * partial-page segment advances by fewer than 8 sectors. */
+            sector += size;
+        }
+
+        /* Push the data out of the stdio buffer before reporting success,
+         * so a deferred write error is not silently lost. */
+        if (fflush(img) != 0) {
+            printf("fflush error on WRITE (%d)\n", errno);
+            goto err;
+        }
+
+        rsp = (blkif_response_t *)req;
+        rsp->id = saved.id;
+        rsp->operation = BLKIF_OP_WRITE;
+        rsp->status = BLKIF_RSP_OKAY;
+
+        return BLKTAP_RESPOND;
+    }
+    case BLKIF_OP_READ:
+    {
+        unsigned long first, last, size;
+
+        sector = saved.sector_number;
+        for (i = 0; i < saved.nr_segments; i++) {
+
+            first = blkif_first_sect(saved.frame_and_sects[i]);
+            last  = blkif_last_sect (saved.frame_and_sects[i]);
+            if (last < first) {
+                printf("bad segment on READ\n");
+                goto err;
+            }
+            size = last - first + 1;
+
+            ret = fseeko64(img, sector << SECTOR_SHIFT, SEEK_SET);
+            if (ret != 0) {
+                printf("fseek error on READ\n");
+                goto err;
+            }
+
+            DPRINTF("iREAD : sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n",
+                    saved.sector_number, sector, first, last,
+                    (long)(sector << SECTOR_SHIFT));
+
+            dpage  = (char *)MMAP_VADDR(ID_TO_IDX(saved.id), i);
+            dpage += first << SECTOR_SHIFT;
+            ret = fread(dpage, size << SECTOR_SHIFT, 1, img);
+            if (ret != 1) {
+                printf("fread error on READ\n");
+                goto err;
+            }
+
+            sector += size;
+        }
+
+        rsp = (blkif_response_t *)req;
+        rsp->id = saved.id;
+        rsp->operation = BLKIF_OP_READ;
+        rsp->status = BLKIF_RSP_OKAY;
+        return BLKTAP_RESPOND;
+    }
+    }
+
+    printf("Unknow block operation!\n");
+err:
+    /* Use the saved copy: writing rsp->id may clobber req->operation. */
+    rsp = (blkif_response_t *)req;
+    rsp->id = saved.id;
+    rsp->operation = saved.operation;
+    rsp->status = BLKIF_RSP_ERROR;
+    return BLKTAP_RESPOND;
+}
+
+/*
+ * Response hook of the image plugin.  The image library terminates the
+ * request stream itself, so there is nothing to do here: the response is
+ * left untouched and BLKTAP_PASS is returned.
+ */
+int image_response(blkif_response_t *rsp)
+{
+    (void)rsp;
+
+    return BLKTAP_PASS;
+}
+
+/* image_init: mark every per-domain slot of images[] as unused (NULL). */
+void image_init(void)
+{
+    int dom = 0;
+
+    while (dom < MAX_DOMS) {
+        images[dom] = NULL;
+        dom++;
+    }
+}
+
diff --git a/tools/blktap/blkimglib.h b/tools/blktap/blkimglib.h
new file mode 100644 (file)
index 0000000..1bc597f
--- /dev/null
@@ -0,0 +1,16 @@
+/* blkimglib.h
+ *
+ * file image-backed block device.
+ * 
+ * (c) 2004 Andrew Warfield.
+ *
+ * Xend has been modified to use an amorfs:[fsid] disk tag.
+ * This will show up as device type (maj:240,min:0) = 61440.
+ *
+ * The fsid is placed in the sec_start field of the disk extent.
+ */
+
+int image_control(control_msg_t *msg);
+int image_request(blkif_request_t *req);
+int image_response(blkif_response_t *rsp); /* noop */
+void image_init(void);
diff --git a/tools/blktap/blkint.h b/tools/blktap/blkint.h
new file mode 100644 (file)
index 0000000..e3ce3b5
--- /dev/null
@@ -0,0 +1,105 @@
+/*
+ * blkint.h
+ * 
+ * Interfaces for the Xen block interposition driver.
+ * 
+ * (c) 2004, Andrew Warfield, University of Cambridge
+ * 
+ */
+
+#ifndef __BLKINT_H__
+
+//#include "blkif.h"
+
+
+#if 0
+/* Types of ring. */
+#define BLKIF_REQ_RING_TYPE 1
+#define BLKIF_RSP_RING_TYPE 2
+
+/* generic ring struct. */
+typedef struct blkif_generic_ring_struct {
+    int type;
+} blkif_generic_ring_t;
+
+/* A requestor's view of a ring. */
+typedef struct blkif_req_ring_struct {
+
+    int type;                    /* Will be BLKIF_REQ_RING_TYPE        */
+    BLKIF_RING_IDX req_prod;     /* PRIVATE req_prod index             */
+    BLKIF_RING_IDX rsp_cons;     /* Response consumer index            */
+    blkif_ring_t *ring;          /* Pointer to shared ring struct      */
+
+} blkif_req_ring_t;
+
+#define BLKIF_REQ_RING_INIT { BLKIF_REQ_RING_TYPE, 0, 0, 0 }
+
+/* A responder's view of a ring. */
+typedef struct blkif_rsp_ring_struct {
+
+    int type;                    /* Will be BLKIF_RSP_RING_TYPE        */
+    BLKIF_RING_IDX rsp_prod;     /* PRIVATE rsp_prod index             */
+    BLKIF_RING_IDX req_cons;     /* Request consumer index             */
+    blkif_ring_t *ring;          /* Pointer to shared ring struct      */
+
+} blkif_rsp_ring_t;
+
+#define BLKIF_RSP_RING_INIT { BLKIF_RSP_RING_TYPE, 0, 0, 0 }
+
+#define RING(a) (blkif_generic_ring_t *)(a)
+inline int BLKTAP_RING_FULL(blkif_generic_ring_t *ring);
+#endif
+
+/* -------[ interposition -> character device interface ]------------- */
+
+/* /dev/xen/blktap resides at device number major=10, minor=202        */ 
+#define BLKTAP_MINOR 202
+
+/* size of the extra VMA area to map in attached pages. */
+#define BLKTAP_VMA_PAGES BLKIF_RING_SIZE
+
+/* blktap IOCTLs:                                                      */
+#define BLKTAP_IOCTL_KICK_FE         1
+#define BLKTAP_IOCTL_KICK_BE         2
+#define BLKTAP_IOCTL_SETMODE         3
+#define BLKTAP_IOCTL_PRINT_IDXS      100   
+
+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
+#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
+#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
+#define BLKTAP_MODE_INTERCEPT_BE     0x00000002
+#define BLKTAP_MODE_COPY_FE          0x00000004
+#define BLKTAP_MODE_COPY_BE          0x00000008
+#define BLKTAP_MODE_COPY_FE_PAGES    0x00000010
+#define BLKTAP_MODE_COPY_BE_PAGES    0x00000020
+
+#define BLKTAP_MODE_INTERPOSE \
+           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
+
+#define BLKTAP_MODE_COPY_BOTH \
+           (BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_BE)
+
+#define BLKTAP_MODE_COPY_BOTH_PAGES \
+           (BLKTAP_MODE_COPY_FE_PAGES | BLKTAP_MODE_COPY_BE_PAGES)
+
+/*
+ * BLKTAP_MODE_VALID: check a mode word passed to BLKTAP_IOCTL_SETMODE.
+ *
+ * Accepted values are passthrough, either single intercept mode, full
+ * interposition, or one of the copy modes (FE, BE or both) with the
+ * matching *_PAGES flag(s) optionally set.
+ *
+ * Returns 1 if arg is an accepted mode, 0 otherwise.
+ */
+static inline int BLKTAP_MODE_VALID(unsigned long arg)
+{
+    /* exact-match modes */
+    if ( arg == BLKTAP_MODE_PASSTHROUGH )
+        return 1;
+    if ( arg == BLKTAP_MODE_INTERCEPT_FE )
+        return 1;
+    if ( arg == BLKTAP_MODE_INTERCEPT_BE )
+        return 1;
+    if ( arg == BLKTAP_MODE_INTERPOSE )
+        return 1;
+
+    /* copy modes: the corresponding page-copy bits are optional */
+    if ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE )
+        return 1;
+    if ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE )
+        return 1;
+    if ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH )
+        return 1;
+
+    return 0;
+}
+
+
+
+
+
+
+
+#define __BLKINT_H__
+#endif
diff --git a/tools/blktap/blktaplib.c b/tools/blktap/blktaplib.c
new file mode 100644 (file)
index 0000000..2399a20
--- /dev/null
@@ -0,0 +1,542 @@
+/*
+ * blktaplib.c
+ * 
+ * userspace interface routines for the blktap driver.
+ *
+ * (c) 2004 Andrew Warfield.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/user.h>
+#include <err.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <linux/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/poll.h>
+#include <sys/ioctl.h>
+#include <string.h>
+#include <unistd.h>
+                                                                     
+
+#define __COMPILING_BLKTAP_LIB
+#include "blktaplib.h"
+
+#if 1
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+#define DEBUG_RING_IDXS 1
+
+#define POLLRDNORM     0x040 
+
+#define BLKTAP_IOCTL_KICK 1
+
+// this is in the header now
+//DEFINE_RING_TYPES(blkif, blkif_request_t, blkif_response_t);
+
+void got_sig_bus();
+void got_sig_int();
+
+
+/* in kernel these are opposite, but we are a consumer now. */
+blkif_back_ring_t  fe_ring; /* slightly counterintuitive ;) */
+blkif_front_ring_t be_ring; 
+ctrl_back_ring_t   ctrl_ring;
+
+
+
+unsigned long mmap_vstart = 0;
+char *blktap_mem;
+int fd = 0;
+
+#define BLKTAP_RING_PAGES       3 /* Ctrl, Back, Front */
+/*#define BLKTAP_MMAP_PAGES       ((11 + 1) * 64)*/
+#define BLKTAP_MMAP_PAGES \
+    ((BLKIF_MAX_SEGMENTS_PER_REQUEST + 1) * BLKIF_RING_SIZE)
+#define BLKTAP_MMAP_REGION_SIZE (BLKTAP_RING_PAGES + BLKTAP_MMAP_PAGES)
+
+
+    
+/* Number of times bad() has been called so far. */
+int bad_count = 0;
+
+/* bad: count one more bad event; once the count exceeds 50 the process exits with status 0. */
+void bad(void)
+{
+    bad_count += 1;
+
+    if (bad_count <= 50)
+        return;
+
+    exit(0);
+}
+/*-----[ ID Manipulation from tap driver code ]--------------------------*/
+
+#define ACTIVE_RING_IDX unsigned short
+
+/*
+ * MAKE_ID: pack a frontend domain id and an active-ring index into one
+ * request id: the domain sits above bit 16, the index in the low 16 bits.
+ */
+inline unsigned long MAKE_ID(domid_t fe_dom, ACTIVE_RING_IDX idx)
+{
+    /* Widen before shifting: fe_dom would otherwise be promoted to a
+     * signed int, and shifting a value >= 0x8000 left by 16 overflows it. */
+    return ( ((unsigned long)fe_dom << 16) | (unsigned long)idx );
+}
+
+/* ID_TO_IDX: recover the ring index (low 16 bits) from a request id. */
+inline unsigned int ID_TO_IDX(unsigned long id)
+{
+    return ( id & 0x0000ffff );
+}
+
+/* ID_TO_DOM: recover the frontend domain (bits 16 and up) from a request id. */
+inline domid_t ID_TO_DOM(unsigned long id)
+{
+    return (id >> 16);
+}
+/*
+static int (*request_hook)(blkif_request_t *req) = NULL;
+static int (*response_hook)(blkif_response_t *req) = NULL;
+*/
+
+/*-----[ Request/Response hook chains.]----------------------------------*/
+
+#define HOOK_NAME_MAX 50
+        
+/* One entry of the singly linked chain of control-message hooks. */
+typedef struct ctrl_hook_st {
+    char name[HOOK_NAME_MAX];        /* label printed by print_hooks() and the listen loop */
+    int (*func)(control_msg_t *);    /* callback invoked for each control message */
+    struct ctrl_hook_st *next;       /* next hook in registration order, or NULL */
+} ctrl_hook_t;
+        
+/* One entry of the singly linked chain of request hooks. */
+typedef struct request_hook_st {
+    char name[HOOK_NAME_MAX];          /* label printed by print_hooks() and the listen loop */
+    int (*func)(blkif_request_t *);    /* callback; returns BLKTAP_PASS, BLKTAP_RESPOND or BLKTAP_STOLEN */
+    struct request_hook_st *next;      /* next hook in registration order, or NULL */
+} request_hook_t;
+
+/* One entry of the singly linked chain of response hooks. */
+typedef struct response_hook_st {
+    char name[HOOK_NAME_MAX];           /* label printed by print_hooks() */
+    int (*func)(blkif_response_t *);    /* callback; only a BLKTAP_PASS result is supported */
+    struct response_hook_st *next;      /* next hook in registration order, or NULL */
+} response_hook_t;
+
+static ctrl_hook_t *ctrl_hook_chain = NULL;
+static request_hook_t *request_hook_chain = NULL;
+static response_hook_t *response_hook_chain = NULL;
+
+/*
+ * blktap_register_ctrl_hook: append a control-message hook to the chain.
+ *
+ * name: label for the hook; truncated to HOOK_NAME_MAX-1 characters.
+ * ch:   callback to invoke for each control message.
+ *
+ * Exits the process with status -1 if the entry cannot be allocated.
+ */
+void blktap_register_ctrl_hook(char *name, int (*ch)(control_msg_t *))
+{
+    ctrl_hook_t  *entry;
+    ctrl_hook_t **tail;
+
+    entry = (ctrl_hook_t *)malloc(sizeof(ctrl_hook_t));
+    if (entry == NULL) {
+        printf("couldn't allocate a new hook\n");
+        exit(-1);
+    }
+
+    strncpy(entry->name, name, HOOK_NAME_MAX);
+    entry->name[HOOK_NAME_MAX-1] = '\0';
+    entry->func = ch;
+    entry->next = NULL;
+
+    /* walk to the NULL link at the end of the chain and hang the entry there */
+    for (tail = &ctrl_hook_chain; *tail != NULL; tail = &(*tail)->next)
+        ;
+    *tail = entry;
+}
+
+/*
+ * blktap_register_request_hook: append a request hook to the chain.
+ *
+ * name: label for the hook; truncated to HOOK_NAME_MAX-1 characters.
+ * rh:   callback to invoke for each frontend request.
+ *
+ * Exits the process with status -1 if the entry cannot be allocated.
+ */
+void blktap_register_request_hook(char *name, int (*rh)(blkif_request_t *))
+{
+    request_hook_t *rh_ent, **c;
+
+    rh_ent = (request_hook_t *)malloc(sizeof(request_hook_t));
+    if (!rh_ent) { printf("couldn't allocate a new hook\n"); exit(-1); }
+
+    rh_ent->func  = rh;
+    rh_ent->next = NULL;
+    strncpy(rh_ent->name, name, HOOK_NAME_MAX);
+    /* strncpy() does not terminate the buffer when name fills it; the
+     * name is later printed with %s, so terminate it explicitly. */
+    rh_ent->name[HOOK_NAME_MAX-1] = '\0';
+
+    /* append at the tail so hooks run in registration order */
+    c = &request_hook_chain;
+    while (*c != NULL) {
+        c = &(*c)->next;
+    }
+    *c = rh_ent;
+}
+
+/*
+ * blktap_register_response_hook: append a response hook to the chain.
+ *
+ * name: label for the hook; truncated to HOOK_NAME_MAX-1 characters.
+ * rh:   callback to invoke for each response on its way to the frontend.
+ *
+ * Exits the process with status -1 if the entry cannot be allocated.
+ */
+void blktap_register_response_hook(char *name, int (*rh)(blkif_response_t *))
+{
+    response_hook_t *rh_ent, **c;
+
+    rh_ent = (response_hook_t *)malloc(sizeof(response_hook_t));
+    if (!rh_ent) { printf("couldn't allocate a new hook\n"); exit(-1); }
+
+    rh_ent->func  = rh;
+    rh_ent->next = NULL;
+    strncpy(rh_ent->name, name, HOOK_NAME_MAX);
+    /* strncpy() does not terminate the buffer when name fills it; the
+     * name is later printed with %s, so terminate it explicitly. */
+    rh_ent->name[HOOK_NAME_MAX-1] = '\0';
+
+    /* append at the tail so hooks run in registration order */
+    c = &response_hook_chain;
+    while (*c != NULL) {
+        c = &(*c)->next;
+    }
+    *c = rh_ent;
+}
+
+/*
+ * print_hooks: dump the three hook chains (control, request, response) to
+ * stdout, one line per hook with the callback address and the hook name.
+ */
+void print_hooks(void)
+{
+    ctrl_hook_t     *ch;
+    request_hook_t  *qh;
+    response_hook_t *ph;
+
+    printf("Control Hooks:\n");
+    for (ch = ctrl_hook_chain; ch != NULL; ch = ch->next)
+        printf("  [0x%p] %s\n", ch->func, ch->name);
+
+    printf("Request Hooks:\n");
+    for (qh = request_hook_chain; qh != NULL; qh = qh->next)
+        printf("  [0x%p] %s\n", qh->func, qh->name);
+
+    printf("Response Hooks:\n");
+    for (ph = response_hook_chain; ph != NULL; ph = ph->next)
+        printf("  [0x%p] %s\n", ph->func, ph->name);
+}
+        
+/*-----[ Data to/from Backend (server) VM ]------------------------------*/
+
+/*
+ * write_req_to_be_ring: copy one request into the next private producer
+ * slot of the backend ring and advance the private producer index.  The
+ * request is not published here; that is done by a later
+ * RING_PUSH_REQUESTS().  Always returns 0.
+ */
+inline int write_req_to_be_ring(blkif_request_t *req)
+{
+    blkif_request_t *slot =
+        RING_GET_REQUEST(BLKIF_RING, &be_ring, be_ring.req_prod_pvt);
+
+    memcpy(slot, req, sizeof(blkif_request_t));
+    wmb(); /* barrier between filling the slot and moving the index */
+    be_ring.req_prod_pvt++;
+
+    return 0;
+}
+
+/*
+ * write_rsp_to_fe_ring: copy one response into the next private producer
+ * slot of the frontend ring and advance the private producer index.  The
+ * response is not published here; that is done by a later
+ * RING_PUSH_RESPONSES().  Always returns 0.
+ */
+inline int write_rsp_to_fe_ring(blkif_response_t *rsp)
+{
+    blkif_response_t *slot =
+        RING_GET_RESPONSE(BLKIF_RING, &fe_ring, fe_ring.rsp_prod_pvt);
+
+    memcpy(slot, rsp, sizeof(blkif_response_t));
+    wmb(); /* barrier between filling the slot and moving the index */
+    fe_ring.rsp_prod_pvt++;
+
+    return 0;
+}
+
+/*
+ * apply_rsp_hooks: run every registered response hook on rsp, in chain
+ * order.  A hook that returns anything other than BLKTAP_PASS only
+ * triggers a diagnostic; the remaining hooks still run.
+ */
+static void apply_rsp_hooks(blkif_response_t *rsp)
+{
+    response_hook_t *hook;
+
+    for (hook = response_hook_chain; hook != NULL; hook = hook->next) {
+        if (hook->func(rsp) != BLKTAP_PASS)
+            printf("Only PASS is supported for resp hooks!\n");
+    }
+}
+
+/*
+ * blktap_inject_response: deliver a response to the frontend outside the
+ * main listen loop (used by plugins that complete requests later).
+ *
+ * The response is run through the response hooks, copied onto the
+ * frontend ring, published, and the frontend is kicked through the tap
+ * device.  The result of the ioctl is not checked.
+ */
+void blktap_inject_response(blkif_response_t *rsp)
+{
+    apply_rsp_hooks(rsp);
+    write_rsp_to_fe_ring(rsp);
+    RING_PUSH_RESPONSES(BLKIF_RING, &fe_ring);
+    ioctl(fd, BLKTAP_IOCTL_KICK_FE);
+}
+
+/*-----[ Polling fd listeners ]------------------------------------------*/
+
+#define MAX_POLLFDS 64
+
+/* A client file descriptor that the listen loop polls on the client's behalf. */
+typedef struct {
+    int (*func)(int fd);   /* callback run by the listen loop when the fd has events */
+    struct pollfd *pfd;    /* entry in pfd[] for this hook; assigned each time the listen loop rebuilds the poll list */
+    int fd;                /* descriptor to poll */
+    short events;          /* poll() event mask requested for fd */
+    int active;            /* non-zero while the hook is attached */
+} pollhook_t;
+
+static struct pollfd  pfd[MAX_POLLFDS+1];
+static pollhook_t     pollhooks[MAX_POLLFDS];
+static unsigned int   ph_freelist[MAX_POLLFDS];
+static unsigned int   ph_cons, ph_prod;
+#define nr_pollhooks() (MAX_POLLFDS - (ph_prod - ph_cons))
+#define PH_IDX(x) (x % MAX_POLLFDS)
+
+/*
+ * blktap_attach_poll: ask the listen loop to poll an additional fd.
+ *
+ * fd:     descriptor to poll.
+ * events: poll() event mask to wait for.
+ * func:   callback run with the fd when it has events.
+ *
+ * Returns 0 on success, -1 if all MAX_POLLFDS hook slots are in use.
+ */
+int blktap_attach_poll(int fd, short events, int (*func)(int fd))
+{
+    unsigned int cursor;
+    pollhook_t  *hook;
+
+    if (nr_pollhooks() == MAX_POLLFDS) {
+        printf("Too many pollhooks!\n");
+        return -1;
+    }
+
+    /* take the next free slot number off the free list */
+    cursor = ph_cons++;
+    hook   = &pollhooks[ph_freelist[PH_IDX(cursor)]];
+
+    hook->active = 1;
+    hook->events = events;
+    hook->fd     = fd;
+    hook->func   = func;
+
+    printf("Added fd %d at ph index %d, now %d phs.\n", fd, cursor,
+            nr_pollhooks());
+
+    return 0;
+}
+
+/*
+ * blktap_detach_poll: stop polling fd and return its hook slot to the
+ * free list.
+ *
+ * The hook is located through its own fd field rather than through its
+ * pollfd pointer: that pointer is only assigned when the listen loop
+ * rebuilds the poll list, so it is NULL (or left over from an earlier
+ * use of the slot) for a hook that was attached since the last rebuild.
+ *
+ * Prints a diagnostic and changes nothing if no active hook uses fd.
+ */
+void blktap_detach_poll(int fd)
+{
+    int i;
+
+    for (i=0; i<MAX_POLLFDS; i++) {
+        pollhook_t *ph = &pollhooks[i];
+
+        if (!ph->active || (ph->fd != fd))
+            continue;
+
+        ph_freelist[PH_IDX(ph_prod++)] = i;
+        /* Neutralise the pollfd entry only if it really is this hook's. */
+        if ((ph->pfd != NULL) && (ph->pfd->fd == fd))
+            ph->pfd->fd = -1;
+        ph->active = 0;
+        break;
+    }
+
+    if (i == MAX_POLLFDS) {
+        printf("No pollhook registered for fd %d.\n", fd);
+        return;
+    }
+
+    printf("Removed fd %d at ph index %d, now %d phs.\n", fd, i,
+            nr_pollhooks());
+}
+
+/*
+ * pollhook_init: reset the poll-hook table.  Every hook is marked
+ * inactive and the free list is filled with all MAX_POLLFDS slot numbers,
+ * so that the whole table is available.
+ */
+void pollhook_init(void)
+{
+    int slot;
+
+    ph_cons = 0;
+    ph_prod = MAX_POLLFDS;
+
+    for (slot = MAX_POLLFDS - 1; slot >= 0; slot--) {
+        pollhooks[slot].active = 0;
+        ph_freelist[slot] = (slot+1) % MAX_POLLFDS;
+    }
+}
+
+/*
+ * Library constructor: announces itself and initialises the poll-hook
+ * table.  The constructor attribute makes this run before main().
+ */
+void __attribute__ ((constructor)) blktaplib_init(void)
+{
+    printf("[[ C O N S T R U C T O R ]]\n");
+
+    pollhook_init();
+}
+
+/*-----[ The main listen loop ]------------------------------------------*/
+
+/*
+ * blktap_listen: the main loop of a blktap user-space plugin.
+ *
+ * Opens /dev/blktap, maps the control, backend and frontend rings plus
+ * the data-page area, switches the driver to interpose mode and then
+ * loops forever:
+ *   - polls the tap device together with all attached poll hooks,
+ *   - dispatches ready poll hooks,
+ *   - feeds control messages to the control hooks,
+ *   - runs frontend requests through the request hooks and either answers
+ *     them, drops them (stolen) or forwards them to the backend ring,
+ *   - runs backend responses through the response hooks and forwards them
+ *     to the frontend ring,
+ *   - kicks whichever side has new ring entries.
+ *
+ * Returns 0 if the device cannot be opened, mapped or switched to
+ * interpose mode, or if poll() fails for a reason other than EINTR;
+ * otherwise it does not return.
+ */
+int blktap_listen(void)
+{
+    int               notify_be, notify_fe, tap_pfd;
+
+    /* comms rings: */
+    blkif_request_t  *req;
+    blkif_response_t *rsp;
+    control_msg_t    *msg;
+    blkif_sring_t    *sring;
+    ctrl_sring_t     *csring;
+    RING_IDX          rp, i, pfd_count;
+
+    /* handler hooks: */
+    request_hook_t   *req_hook;
+    ctrl_hook_t      *ctrl_hook;
+
+    /* polled[k] is the hook whose descriptor sits in pfd[k] this cycle. */
+    pollhook_t       *polled[MAX_POLLFDS];
+
+    signal (SIGBUS, got_sig_bus);
+    signal (SIGINT, got_sig_int);
+
+    print_hooks();
+
+    fd = open("/dev/blktap", O_RDWR);
+    if (fd == -1) {
+        printf("open failed! (%d)\n", errno);
+        goto open_failed;
+    }
+
+    blktap_mem = mmap(0, PAGE_SIZE * BLKTAP_MMAP_REGION_SIZE,
+             PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+    /* Compare against MAP_FAILED; narrowing the pointer to int is not a
+     * reliable test. */
+    if (blktap_mem == MAP_FAILED) {
+        printf("mmap failed! (%d)\n", errno);
+        goto mmap_failed;
+    }
+
+    /* assign the rings to the mapped memory */
+    csring = (ctrl_sring_t *)blktap_mem;
+    BACK_RING_INIT(CTRL_RING, &ctrl_ring, csring);
+
+    sring = (blkif_sring_t *)((unsigned long)blktap_mem + PAGE_SIZE);
+    FRONT_RING_INIT(BLKIF_RING, &be_ring, sring);
+
+    sring = (blkif_sring_t *)((unsigned long)blktap_mem + (2 *PAGE_SIZE));
+    BACK_RING_INIT(BLKIF_RING, &fe_ring, sring);
+
+    mmap_vstart = (unsigned long)blktap_mem + (BLKTAP_RING_PAGES << PAGE_SHIFT);
+
+    printf("fe_ring mapped at: %p\n", fe_ring.sring);
+    printf("be_ring mapped at: %p\n", be_ring.sring);
+
+    if (ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE ) < 0) {
+        printf("couldn't switch to interpose mode! (%d)\n", errno);
+        goto setmode_failed;
+    }
+
+    while(1) {
+        int ret;
+        RING_IDX k;
+
+        /* build the poll list */
+
+        DPRINTF("Building poll list.\n");
+
+        pfd_count = 0;
+        for ( i=0; i < MAX_POLLFDS; i++ ) {
+            pollhook_t *ph = &pollhooks[i];
+
+            if (ph->active) {
+                pfd[pfd_count].fd      = ph->fd;
+                pfd[pfd_count].events  = ph->events;
+                pfd[pfd_count].revents = 0;
+                ph->pfd                = &pfd[pfd_count];
+                polled[pfd_count]      = ph;
+                pfd_count++;
+            }
+        }
+
+        tap_pfd = pfd_count;
+        pfd[tap_pfd].fd = fd;
+        pfd[tap_pfd].events = POLLIN;
+        pfd[tap_pfd].revents = 0;
+
+        DPRINTF("poll() %d fds.\n", pfd_count);
+
+        /* Assign first, test afterwards: "ret = poll(...) == 0" stored the
+         * result of the comparison and hid poll() errors. */
+        ret = poll(pfd, pfd_count+1, 10000);
+        if (ret == 0) {
+            if (DEBUG_RING_IDXS)
+                ioctl(fd, BLKTAP_IOCTL_PRINT_IDXS);
+            continue;
+        }
+        if (ret < 0) {
+            if (errno == EINTR)
+                continue;
+            printf("poll failed! (%d)\n", errno);
+            break;
+        }
+
+        DPRINTF("poll returned %d\n", ret);
+
+        /* Dispatch only the hooks that were part of this poll() call.  A
+         * callback may attach or detach hooks, so each entry is checked
+         * again: it must still be active and pfd[k] must still carry its
+         * descriptor (a detach sets that fd to -1). */
+        for (k = 0; k < pfd_count; k++) {
+            pollhook_t *ph = polled[k];
+
+            if ( (ph->active) && (ph->fd == pfd[k].fd) && (pfd[k].revents) )
+                ph->func(pfd[k].fd);
+        }
+
+        if (pfd[tap_pfd].revents) {
+
+            /* empty the control ring */
+            rp = ctrl_ring.sring->req_prod;
+            rmb();
+            /* "!=" rather than "<" so that the loop survives index wrap,
+             * like the two data-ring loops below. */
+            for (i = ctrl_ring.req_cons; i != rp; i++)
+            {
+                msg = RING_GET_REQUEST(CTRL_RING, &ctrl_ring, i);
+
+                ctrl_hook = ctrl_hook_chain;
+                while (ctrl_hook != NULL)
+                {
+                    DPRINTF("CTRL_HOOK: %s\n", ctrl_hook->name);
+                    /* We currently don't respond to ctrl messages. */
+                    ctrl_hook->func(msg);
+                    ctrl_hook = ctrl_hook->next;
+                }
+            }
+            /* Using this as a unidirectional ring. */
+            ctrl_ring.req_cons = ctrl_ring.rsp_prod_pvt = i;
+            RING_PUSH_RESPONSES(CTRL_RING, &ctrl_ring);
+
+            /* empty the fe_ring */
+            notify_fe = 0;
+            notify_be = RING_HAS_UNCONSUMED_REQUESTS(BLKIF_RING, &fe_ring);
+            rp = fe_ring.sring->req_prod;
+            rmb();
+            for (i = fe_ring.req_cons; i != rp; i++)
+            {
+                int done = 0; /* stop forwarding this request */
+
+                req = RING_GET_REQUEST(BLKIF_RING, &fe_ring, i);
+
+                DPRINTF("copying an fe request\n");
+
+                req_hook = request_hook_chain;
+                while (req_hook != NULL)
+                {
+                    DPRINTF("REQ_HOOK: %s\n", req_hook->name);
+                    switch(req_hook->func(req))
+                    {
+                    case BLKTAP_RESPOND:
+                        apply_rsp_hooks((blkif_response_t *)req);
+                        write_rsp_to_fe_ring((blkif_response_t *)req);
+                        notify_fe = 1;
+                        done = 1;
+                        break;
+                    case BLKTAP_STOLEN:
+                        done = 1;
+                        break;
+                    case BLKTAP_PASS:
+                        break;
+                    default:
+                        printf("Unknown request hook return value!\n");
+                    }
+                    if (done) break;
+                    req_hook = req_hook->next;
+                }
+
+                if (done == 0) write_req_to_be_ring(req);
+
+            }
+            fe_ring.req_cons = i;
+
+            /* empty the be_ring */
+            notify_fe |= RING_HAS_UNCONSUMED_RESPONSES(BLKIF_RING, &be_ring);
+            rp = be_ring.sring->rsp_prod;
+            rmb();
+            for (i = be_ring.rsp_cons; i != rp; i++)
+            {
+
+                rsp = RING_GET_RESPONSE(BLKIF_RING, &be_ring, i);
+
+                DPRINTF("copying a be request\n");
+
+                apply_rsp_hooks(rsp);
+                write_rsp_to_fe_ring(rsp);
+            }
+            be_ring.rsp_cons = i;
+
+            /* notify the domains */
+
+            if (notify_be) {
+                DPRINTF("notifying be\n");
+                RING_PUSH_REQUESTS(BLKIF_RING, &be_ring);
+                ioctl(fd, BLKTAP_IOCTL_KICK_BE);
+            }
+
+            if (notify_fe) {
+                DPRINTF("notifying fe\n");
+                RING_PUSH_RESPONSES(BLKIF_RING, &fe_ring);
+                ioctl(fd, BLKTAP_IOCTL_KICK_FE);
+            }
+        }
+    }
+
+    /* Only reached when poll() failed: hand the device back to
+     * passthrough mode, as the SIGINT handler does. */
+    ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_PASSTHROUGH );
+
+ setmode_failed:
+    /* unmap the whole region that was mapped above */
+    munmap(blktap_mem, PAGE_SIZE * BLKTAP_MMAP_REGION_SIZE);
+
+ mmap_failed:
+    close(fd);
+    fd = -1; /* keeps the SIGINT handler from using the closed fd */
+
+ open_failed:
+    return 0;
+}
+
+/* SIGBUS handler: report the bad page access and exit with status -1. */
+void got_sig_bus()
+{
+    printf("Attempted to access a page that isn't.\n");
+
+    exit(-1);
+}
+
+/*
+ * SIGINT handler: if the tap device is open, switch the driver back to
+ * passthrough mode, then exit with status 0.
+ */
+void got_sig_int()
+{
+    printf("quitting -- returning to passthrough mode.\n");
+
+    if (fd > 0) {
+        ioctl(fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_PASSTHROUGH );
+    }
+
+    exit(0);
+}
diff --git a/tools/blktap/blktaplib.h b/tools/blktap/blktaplib.h
new file mode 100644 (file)
index 0000000..7b38f56
--- /dev/null
@@ -0,0 +1,76 @@
+/* blktaplib.h
+ *
+ * userland accessors to the block tap.
+ *
+ * for the moment this is rather simple.
+ */
+#ifndef __BLKTAPLIB_H__
+#define __BLKTAPLIB_H__
+
+#include <stdint.h>
+
+typedef uint8_t            u8;
+typedef uint16_t           u16;
+typedef uint32_t           u32;
+typedef uint64_t           u64;
+typedef int8_t             s8;
+typedef int16_t            s16;
+typedef int32_t            s32;
+typedef int64_t            s64;
+                                                                                
+#if defined(__i386__)
+#define rmb() __asm__ __volatile__ ( "lock; addl $0,0(%%esp)" : : : "memory" )
+#define wmb() __asm__ __volatile__ ( "" : : : "memory" )
+#else
+#error "Define barriers"
+#endif
+    
+#include <sys/user.h>
+#include <xen/xen.h>
+#include <xen/io/blkif.h>
+#include <xen/io/ring.h>
+#include <xen/io/domain_controller.h>
+#include "blkint.h"
+
+#define BLKTAP_PASS     0 /* Keep passing this request as normal. */
+#define BLKTAP_RESPOND  1 /* Request is now a reply.  Return it.  */
+#define BLKTAP_STOLEN   2 /* Hook has stolen request.             */
+
+#define domid_t unsigned short
+
+inline unsigned int ID_TO_IDX(unsigned long id);
+inline domid_t ID_TO_DOM(unsigned long id);
+
+void blktap_register_ctrl_hook(char *name, int (*ch)(control_msg_t *));
+void blktap_register_request_hook(char *name, int (*rh)(blkif_request_t *));
+void blktap_register_response_hook(char *name, int (*rh)(blkif_response_t *));
+void blktap_inject_response(blkif_response_t *);
+int  blktap_attach_poll(int fd, short events, int (*func)(int));
+void blktap_detach_poll(int fd);
+int  blktap_listen(void);
+
+/*-----[ Accessing attached data page mappings ]-------------------------*/
+#define MMAP_PAGES_PER_REQUEST \
+    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
+#define MMAP_VADDR(_req,_seg)                        \
+    (mmap_vstart +                                   \
+     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
+     ((_seg) * PAGE_SIZE))
+
+extern unsigned long mmap_vstart;
+
+
+/*-----[ Defines that are only used by library clients ]-----------------*/
+
+#ifndef __COMPILING_BLKTAP_LIB
+
+/* Printable names of the block operations, indexed by BLKIF_OP_* value.
+ * Indices without an initialiser are NULL. */
+static char *blkif_op_name[] = {
+    [BLKIF_OP_READ]       = "READ",
+    [BLKIF_OP_WRITE]      = "WRITE",
+    [BLKIF_OP_PROBE]      = "PROBE",
+};
+
+#endif /* __COMPILING_BLKTAP_LIB */
+    
+#endif /* __BLKTAPLIB_H__ */
diff --git a/tools/blktap/libgnbd/Makefile b/tools/blktap/libgnbd/Makefile
new file mode 100644 (file)
index 0000000..4297c02
--- /dev/null
@@ -0,0 +1,8 @@
+
+# Compile with all warnings enabled and treated as errors, plus debug info.
+CFLAGS += -Wall -Werror -g
+LDFLAGS += -g
+
+# Static archive of the gnbd client library.  This is the first target in
+# the file and therefore the default goal.  libgnbd.o itself has no
+# explicit rule and is produced by make's built-in .c -> .o rule.
+libgnbd.a: libgnbd.o
+       $(AR) r $@ $<
+
+# Test program.  There is no recipe: linking relies on make's built-in
+# rule for building an executable from object files.
+gnbdtest: gnbdtest.o libgnbd.a
diff --git a/tools/blktap/libgnbd/gnbdtest.c b/tools/blktap/libgnbd/gnbdtest.c
new file mode 100644 (file)
index 0000000..bc39159
--- /dev/null
@@ -0,0 +1,90 @@
+
+#include <err.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/poll.h>
+
+#include "libgnbd.h"
+
+/*
+ * Output helpers.  Both take a single, fully parenthesised printf argument
+ * list, e.g. DPRINTF(("x=%d\n", x)).  PRINTF always prints to stdout.
+ * DPRINTF is compiled out (expands to nothing) unless the "#if 0" below is
+ * changed to "#if 1", in which case it prints to stderr.
+ */
+#define PRINTF(x) printf x
+#if 0
+#define DFPRINTF(x...) fprintf(stderr, ##x)
+#define DPRINTF(x) DFPRINTF x
+#else
+#define DPRINTF(x)
+#endif
+
+static unsigned char buf1[8 << 9];
+static unsigned char buf2[8 << 9];
+static unsigned char buf3[8 << 9];
+
+/*
+ * Manual smoke test for libgnbd.
+ *
+ * Connects to a hard-coded gnbd server/device, then drives the library
+ * from a poll() loop: every time the socket becomes ready gnbd_reply() is
+ * called and its status code dispatched.  Once the login completes, three
+ * 8-sector reads (cookies 1, 0 and 2) are issued into buf2, buf1 and buf3.
+ *
+ * The loop ends when poll() fails or the library no longer has an open
+ * socket.  Command-line arguments are ignored.
+ */
+int
+main(int argc, char **argv)
+{
+       struct gnbd_handle *gh;
+       struct pollfd pfd[1];
+       int err, tout;
+
+       gh = gnbd_setup("panik", 0x38e7, "cl349-nahant-beta2-root1",
+           "arcadians.cl.cam.ac.uk");
+       if (gh == NULL)
+               errx(1, "gnbd_setup");
+
+       memset(pfd, 0, sizeof(pfd));
+       pfd[0].fd = gnbd_fd(gh);
+       pfd[0].events = POLLIN;
+
+       /*
+        * Block until the socket is ready (timeout -1).  A timeout of 0
+        * makes poll() return immediately and turns this loop into a busy
+        * spin that burns a whole CPU while waiting for the server.
+        */
+       while ((tout = poll(pfd, 1, -1)) >= 0) {
+               if (tout == 0)
+                       continue;
+               DPRINTF(("event\n"));
+               if (pfd[0].revents) {
+                       err = gnbd_reply(gh);
+                       pfd[0].events = POLLIN;
+                       switch (err) {
+                       case GNBD_LOGIN_DONE:
+                               DPRINTF(("sectors: %08llu\n",
+                                           gnbd_sectors(gh)));
+                               err = gnbd_read(gh, 8, 8, buf2, 1);
+                               if (err)
+                                       warnx("gnbd_read");
+                               err = gnbd_read(gh, 0, 8, buf1, 0);
+                               if (err)
+                                       warnx("gnbd_read");
+                               err = gnbd_read(gh, 16, 8, buf3, 2);
+                               if (err)
+                                       warnx("gnbd_read");
+                               break;
+                       case GNBD_REQUEST_DONE:
+                               DPRINTF(("request done %ld\n",
+                                           gnbd_finished_request(gh)));
+                               /* Write-back experiment, disabled by "0 &&". */
+                               if (0 && gnbd_finished_request(gh) == 0) {
+                                       write(1, buf1, 8 << 9);
+                                       err = gnbd_write(gh, 0, 8, buf1, 10);
+                                       if (err)
+                                               warnx("gnbd_write");
+                               }
+                               break;
+                       case GNBD_CONTINUE:
+                               DPRINTF(("continue\n"));
+                               break;
+                       case 0:
+                               break;
+                       case GNBD_CONTINUE_WRITE:
+                               DPRINTF(("continue write\n"));
+                               pfd[0].events |= POLLOUT;
+                               break;
+                       default:
+                               warnx("gnbd_reply error");
+                               break;
+                       }
+                       DPRINTF(("got gnbd reply\n"));
+
+                       /*
+                        * gnbd_reply() may close the socket and open a new
+                        * one (the login step reconnects), so re-read the
+                        * descriptor instead of polling a stale one.
+                        */
+                       pfd[0].fd = gnbd_fd(gh);
+                       if (pfd[0].fd < 0) {
+                               warnx("connection lost");
+                               break;
+                       }
+               }
+       }
+
+       return 0;
+}
diff --git a/tools/blktap/libgnbd/libgnbd.c b/tools/blktap/libgnbd/libgnbd.c
new file mode 100644 (file)
index 0000000..2856ca3
--- /dev/null
@@ -0,0 +1,647 @@
+/* libgnbd.c
+ * 
+ * gnbd client library
+ *
+ * Copyright (c) 2005, Christian Limpach
+ */
+  
+#include <byteswap.h>
+#include <endian.h>
+#include <err.h>
+#include <errno.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+#include <stdio.h>
+
+#include "libgnbd.h"
+
+#define        PROTOCOL_VERSION        2
+
+#define        EXTERN_KILL_GSERV_REQ   5
+#define        EXTERN_LOGIN_REQ        6
+
+#define        GNBD_REQUEST_MAGIC      0x37a07e00
+#define        GNBD_KEEP_ALIVE_MAGIC   0x5b46d8c2
+#define        GNBD_REPLY_MAGIC        0x41f09370
+
+/*
+ * Command codes carried in the "type" field of struct gnbd_server_request.
+ * Only READ and WRITE are issued by this file; DISC and PING are defined
+ * but not used here.
+ */
+enum {
+       GNBD_CMD_READ = 0,
+       GNBD_CMD_WRITE = 1,
+       GNBD_CMD_DISC = 2,
+       GNBD_CMD_PING = 3
+};
+
+/*
+ * 64-bit host <-> network (big-endian) conversion: the identity on
+ * big-endian hosts, a byte swap on little-endian hosts.  If __BYTE_ORDER
+ * matches neither value the macros stay undefined and the build fails at
+ * their first use.
+ */
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define htonll(x) (x)
+#define ntohll(x) (x)
+#endif
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define htonll(x) bswap_64(x)
+#define ntohll(x) bswap_64(x)
+#endif
+
+/*
+ * Output helpers taking one fully parenthesised printf argument list,
+ * e.g. DPRINTF(("flags %x\n", f)).  DPRINTF expands to nothing unless the
+ * "#if 0" below is flipped, in which case it prints to stderr.
+ */
+#define PRINTF(x) printf x
+#if 0
+#define DFPRINTF(x...) fprintf(stderr, ##x)
+#define DPRINTF(x) DFPRINTF x
+#else
+#define DPRINTF(x)
+#endif
+
+/*
+ * Book-keeping for one read or write issued to the server.
+ *
+ *   gr_next   - link in the handle's singly linked list of outstanding
+ *               requests
+ *   gr_buf    - caller-supplied data buffer
+ *   gr_size   - total transfer size in bytes (sector count << 9)
+ *   gr_done   - bytes transferred so far
+ *   gr_cookie - caller's opaque tag, reported back through
+ *               gnbd_finished_request() when the request completes
+ *
+ * The address of this structure is what gets sent to the server in the
+ * request's "handle" field and is matched again when the reply arrives.
+ */
+struct gnbd_request {
+       struct gnbd_request     *gr_next;
+       unsigned char           *gr_buf;
+       ssize_t                 gr_size;
+       ssize_t                 gr_done;
+       unsigned long           gr_cookie;
+};
+
+/*
+ * Per-connection state; opaque to library users (see libgnbd.h).
+ *
+ *   gh_fd       - socket to the server, or -1 when not connected
+ *   gh_flags    - GHF_* bits below; select what gnbd_reply() reads next
+ *   gh_sectors  - device size taken from the login reply
+ *   gh_devname / gh_nodename - names sent in the kill-gserv and login
+ *                 requests
+ *   gh_sin      - server address used by socket_connect()
+ *   gh_outstanding_requests      - head of the list of sent requests
+ *   gh_outstanding_requests_last - address of the list's terminating
+ *                 NULL link, so appending needs no traversal
+ *   gh_incoming_request - request whose data is currently being read
+ *                 (valid while GHF_INCOMING_REQUEST is set)
+ *   gh_finished_request - cookie of the most recently completed request
+ */
+struct gnbd_handle {
+       int                     gh_fd;
+       unsigned int            gh_flags;
+       uint64_t                gh_sectors;
+       char                    gh_devname[32];
+       char                    gh_nodename[65];
+       struct sockaddr_in      gh_sin;
+       struct gnbd_request     *gh_outstanding_requests;
+       struct gnbd_request     **gh_outstanding_requests_last;
+       struct gnbd_request     *gh_incoming_request;
+       unsigned long           gh_finished_request;
+};
+/* gh_flags bits: which kind of reply gnbd_reply() must consume next. */
+#define        GHF_EXPECT_KILL_GSERV_REPLY     0x0001
+#define        GHF_EXPECT_LOGIN_REPLY          0x0002
+#define        GHF_INCOMING_REQUEST            0x0004
+
+/* Fixed-width device name record, written to the socket by kill_gserv(). */
+struct device_req {
+       char            name[32];
+};
+
+/*
+ * Fixed-width node name record, written to the socket by both
+ * kill_gserv() and login().
+ */
+struct node_req {
+       char            node_name[65];
+};
+
+/*
+ * Login request record written to the socket by login().  timestamp and
+ * version are stored in network byte order; devname is a fixed-width
+ * field filled with strncpy().
+ */
+struct login_req {
+        uint64_t       timestamp;
+        uint16_t       version;
+        uint8_t                pad[6];
+        char           devname[32];
+};
+
+/*
+ * Login reply record read from the socket by login_reply().  sectors and
+ * version arrive in network byte order.  A non-zero err means the login
+ * failed; in that case a non-zero version is reported as a protocol
+ * version mismatch.
+ */
+struct login_reply {
+        uint64_t       sectors;
+        uint16_t       version;
+        uint8_t                err;
+        uint8_t                pad[5];
+};
+
+/*
+ * Header of a read/write request as written to the socket (packed, so no
+ * padding is inserted).  magic, type, from and len are sent in network
+ * byte order; "from" is a byte offset and "len" a byte count.  handle is
+ * an opaque 8-byte tag; this library stores the address of its
+ * struct gnbd_request there.
+ */
+struct gnbd_server_request {
+       uint32_t        magic;
+       uint32_t        type;
+       char            handle[8];
+       uint64_t        from;
+       uint32_t        len;
+} __attribute__ ((packed));
+
+/*
+ * Header of a server reply as read from the socket (packed).  magic and
+ * error are converted with ntohl() by gnbd_reply(); handle is copied back
+ * into a request pointer and looked up in the outstanding list.
+ */
+struct gnbd_server_reply {
+       uint32_t        magic;
+       uint32_t        error;
+       char            handle[8];
+} __attribute__ ((packed));
+
+/*
+ * Read from fd into buf.  Two modes, selected by read_count:
+ *
+ *   read_count != NULL  ("partial" mode): perform one read() of up to
+ *       count bytes and store the number of bytes obtained (possibly 0 at
+ *       end of file) in *read_count.  Returns 0 on success, 1 on error.
+ *
+ *   read_count == NULL  ("exact" mode): keep reading until exactly count
+ *       bytes have arrived.  A stream socket may deliver a fixed-size
+ *       record in several pieces, so a short read is continued instead of
+ *       being reported as a failure (which would also drop the bytes
+ *       already consumed and desynchronise the stream).  Returns 0 on
+ *       success, or EINTR - the historical failure code of this function -
+ *       on a read error or end of file before count bytes were read.
+ *
+ * In both modes a read() interrupted by a signal (errno == EINTR) is
+ * simply retried.
+ */
+static int
+read_buf(int fd, void *buf, size_t count, size_t *read_count)
+{
+       unsigned char *p = buf;
+       size_t got = 0;
+       ssize_t n;
+
+       if (read_count) {
+               do {
+                       n = read(fd, buf, count);
+               } while (n < 0 && errno == EINTR);
+               if (n < 0)
+                       return 1;
+               *read_count = n;
+               return 0;
+       }
+
+       while (got < count) {
+               n = read(fd, p + got, count - got);
+               if (n < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       return EINTR;   /* xxx */
+               }
+               if (n == 0)
+                       return EINTR;   /* xxx: EOF inside a record */
+               got += n;
+       }
+       return 0;
+}
+
+/*
+ * Read one 32-bit big-endian word from fd and store it, converted to host
+ * byte order, in *val.  Returns 0 on success or the non-zero error from
+ * read_buf(); *val is left untouched on failure.
+ *
+ * The wire value is held in a uint32_t: "unsigned long" is 8 bytes on
+ * LP64 systems, so using it as the buffer would consume 8 bytes from the
+ * stream instead of 4.
+ */
+static int
+read_4(int fd, unsigned long *val)
+{
+       uint32_t word;
+       int err;
+
+       err = read_buf(fd, &word, sizeof(word), NULL);
+       if (err == 0)
+               *val = ntohl(word);
+       return err;
+}
+
+/*
+ * Write exactly count bytes from buf to fd.
+ *
+ * write() may accept fewer bytes than requested; the remainder is
+ * re-submitted until everything has been sent, and a call interrupted by
+ * a signal (EINTR) is retried.  Returns 0 once all bytes are written and
+ * 1 on any other failure.
+ */
+static int
+write_buf(int fd, void *buf, size_t count)
+{
+       const unsigned char *p = buf;
+       size_t sent = 0;
+       ssize_t n;
+
+       while (sent < count) {
+               n = write(fd, p + sent, count - sent);
+               if (n < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       return 1;
+               }
+               if (n == 0)
+                       return 1;       /* no progress: do not spin */
+               sent += n;
+       }
+       return 0;
+}
+
+/*
+ * Write val to fd as one 32-bit big-endian word.  Returns the result of
+ * write_buf() (0 on success, non-zero on failure).
+ *
+ * The wire value is held in a uint32_t: "unsigned long" is 8 bytes on
+ * LP64 systems, so using it as the buffer would put 8 bytes on the wire
+ * instead of 4.
+ */
+static int
+write_4(int fd, unsigned long val)
+{
+       uint32_t word;
+
+       word = htonl(val);
+       return write_buf(fd, &word, sizeof(word));
+}
+
+
+/*
+ * Make sure gh has a connected TCP socket to the address in gh->gh_sin.
+ *
+ * Does nothing when a socket is already open.  Returns 0 on success.  On
+ * failure a warning is printed, gh->gh_fd is negative, and the failing
+ * call's (negative) return value is returned.
+ */
+static int
+socket_connect(struct gnbd_handle *gh)
+{
+       int rc;
+
+       /* Already connected. */
+       if (gh->gh_fd >= 0)
+               return 0;
+
+       gh->gh_fd = socket(PF_INET, SOCK_STREAM, 0);
+       if (gh->gh_fd < 0) {
+               warn("socket");
+               return gh->gh_fd;
+       }
+
+       rc = connect(gh->gh_fd, (struct sockaddr *)&gh->gh_sin,
+           sizeof(gh->gh_sin));
+       if (rc == 0)
+               return 0;
+
+       /* connect() failed: report it and drop the half-set-up socket. */
+       warn("connect");
+       close (gh->gh_fd);
+       gh->gh_fd = -1;
+       return rc;
+}
+
+/*
+ * Close the handle's socket and mark the handle as disconnected
+ * (gh_fd == -1).  The result of close() is ignored; always returns 0.
+ */
+static int
+socket_shutdown(struct gnbd_handle *gh)
+{
+       int fd = gh->gh_fd;
+
+       gh->gh_fd = -1;
+       close (fd);
+       return 0;
+}
+
+/*
+ * Look gr up in the handle's list of outstanding requests and, if it is
+ * there, unlink it.  Only pointer values are compared; gr is not
+ * dereferenced unless it was found in the list.
+ *
+ * Returns 0 when gr was found and removed, ENOENT otherwise.
+ */
+static int
+find_request(struct gnbd_handle *gh, struct gnbd_request *gr)
+{
+       struct gnbd_request **link = &gh->gh_outstanding_requests;
+
+       while (*link != NULL) {
+               if (*link != gr) {
+                       link = &(*link)->gr_next;
+                       continue;
+               }
+
+               /* Splice gr out; fix the tail pointer if it was last. */
+               *link = gr->gr_next;
+               if (*link == NULL)
+                       gh->gh_outstanding_requests_last = link;
+               return 0;
+       }
+       return ENOENT;
+}
+
+/*
+ * Send an EXTERN_KILL_GSERV_REQ for this handle's device and node.
+ *
+ * Connects if necessary, then writes the request code followed by a
+ * device_req and a node_req record.  On success GHF_EXPECT_KILL_GSERV_REPLY
+ * is set so that the next gnbd_reply() call consumes the server's answer,
+ * and 0 is returned.  On a write failure the socket is shut down and the
+ * non-zero error is returned.
+ */
+static int
+kill_gserv(struct gnbd_handle *gh)
+{
+       struct device_req dr;
+       struct node_req nr;
+       int err;
+
+       DPRINTF(("gnbd_kill_gserv\n"));
+       err = socket_connect(gh);
+       if (err) {
+               warnx("socket_connect");
+               return err;
+       }
+
+       err = write_4(gh->gh_fd, EXTERN_KILL_GSERV_REQ);
+       if (err) {
+               /* Name the request actually being sent (was a copy/paste
+                * of the login message). */
+               warnx("send EXTERN_KILL_GSERV_REQ failed");
+               goto out;
+       }
+
+       /* strncpy() zero-fills the rest of the fixed-width field. */
+       strncpy(dr.name, gh->gh_devname, sizeof(dr.name));
+       err = write_buf(gh->gh_fd, &dr, sizeof(dr));
+       if (err) {
+               warnx("send device_req failed");
+               goto out;
+       }
+
+       strncpy(nr.node_name, gh->gh_nodename, sizeof(nr.node_name));
+       err = write_buf(gh->gh_fd, &nr, sizeof(nr));
+       if (err) {
+               warnx("send node_req failed");
+               goto out;
+       }
+
+       gh->gh_flags |= GHF_EXPECT_KILL_GSERV_REPLY;
+       DPRINTF(("gnbd_kill_gserv ok\n"));
+
+       return 0;
+ out:
+       socket_shutdown(gh);
+       return err;
+}
+
+/*
+ * Send an EXTERN_LOGIN_REQ for this handle's device and node.
+ *
+ * Connects if necessary, then writes the request code, a login_req record
+ * (current time in microseconds, protocol version, device name) and a
+ * node_req record.  On success GHF_EXPECT_LOGIN_REPLY is set so that the
+ * next gnbd_reply() call consumes the login reply, and 0 is returned.  On
+ * failure the socket is shut down and the non-zero error is returned.
+ */
+static int
+login(struct gnbd_handle *gh)
+{
+       struct login_req lr;
+       struct node_req nr;
+       int err;
+       uint64_t timestamp;
+       struct timeval tv;
+
+       DPRINTF(("gnbd_login\n"));
+       err = socket_connect(gh);
+       if (err) {
+               warnx("socket_connect");
+               return err;
+       }
+
+       err = write_4(gh->gh_fd, EXTERN_LOGIN_REQ);
+       if (err) {
+               warnx("send EXTERN_LOGIN_REQ failed");
+               goto out;
+       }
+
+       err = gettimeofday(&tv, NULL);
+       if (err) {
+               warnx("gettimeofday");
+               goto out;
+       }
+       timestamp = (uint64_t)tv.tv_sec * 1000000 + tv.tv_usec;
+
+       /*
+        * Clear the whole record first: the pad bytes are never assigned
+        * and would otherwise send uninitialised stack contents to the
+        * server.
+        */
+       memset(&lr, 0, sizeof(lr));
+       lr.timestamp = htonll(timestamp);
+       lr.version = htons(PROTOCOL_VERSION);
+       strncpy(lr.devname, gh->gh_devname, sizeof(lr.devname));
+       err = write_buf(gh->gh_fd, &lr, sizeof(lr));
+       if (err) {
+               warnx("send login_req failed");
+               goto out;
+       }
+
+       strncpy(nr.node_name, gh->gh_nodename, sizeof(nr.node_name));
+       err = write_buf(gh->gh_fd, &nr, sizeof(nr));
+       if (err) {
+               warnx("send node_req failed");
+               goto out;
+       }
+
+       gh->gh_flags |= GHF_EXPECT_LOGIN_REPLY;
+
+       DPRINTF(("gnbd_login ok\n"));
+       return 0;
+ out:
+       socket_shutdown(gh);
+       return err;
+}
+
+/*
+ * Consume the server's answer to EXTERN_KILL_GSERV_REQ and move on to the
+ * login step.
+ *
+ * A reply of 0 or ENODEV counts as success: the expectation flag is
+ * cleared, the socket is closed and login() is started on a fresh
+ * connection.  Any other reply value is reported and returned, as is a
+ * failure to read the reply; in those cases the handle is left unchanged.
+ */
+static int
+kill_gserv_reply(struct gnbd_handle *gh)
+{
+       unsigned long status;
+       int rc;
+
+       DPRINTF(("read gnbd_kill_gserv_reply\n"));
+       rc = read_4(gh->gh_fd, &status);
+       if (rc != 0) {
+               warnx("read kill_gserv_reply failed");
+               return rc;
+       }
+
+       if (!(status == 0 || status == ENODEV)) {
+               warnx("kill gserv failed: %s", strerror(status));
+               return status;
+       }
+
+       /* Kill step done: reconnect and log in. */
+       gh->gh_flags &= ~GHF_EXPECT_KILL_GSERV_REPLY;
+       socket_shutdown(gh);
+
+       rc = login(gh);
+       if (rc != 0)
+               warnx("gnbd_login");
+       return rc;
+}
+
+/*
+ * Consume the server's login reply.
+ *
+ * On success the device size is stored in gh->gh_sectors, the
+ * expectation flag is cleared and GNBD_LOGIN_DONE is returned.  If the
+ * reply carries an error, a non-zero version field is reported as a
+ * protocol version mismatch (EINVAL); otherwise the server's error code
+ * is reported and returned.  A failed read returns read_buf()'s error.
+ */
+static int
+login_reply(struct gnbd_handle *gh)
+{
+       struct login_reply reply;
+       int rc;
+
+       DPRINTF(("read gnbd_login_reply\n"));
+       rc = read_buf(gh->gh_fd, &reply, sizeof(reply), NULL);
+       if (rc != 0) {
+               warnx("read login_reply failed");
+               return rc;
+       }
+
+       if (reply.err == 0) {
+               gh->gh_sectors = ntohll(reply.sectors);
+               gh->gh_flags &= ~GHF_EXPECT_LOGIN_REPLY;
+               return GNBD_LOGIN_DONE;
+       }
+
+       if (reply.version == 0) {
+               warnx("login refused: %s", strerror(reply.err));
+               return reply.err;
+       }
+
+       warnx("gnbd version mismatch %04x != %04x",
+           PROTOCOL_VERSION, ntohs(reply.version));
+       return EINVAL;
+}
+
+/*
+ * Read the next piece of data for the request in gh->gh_incoming_request.
+ *
+ * One read is issued for the bytes still missing.  Returns
+ * GNBD_REQUEST_DONE once the buffer is complete (the request is freed and
+ * its cookie stored in gh->gh_finished_request), GNBD_CONTINUE while more
+ * data is expected, or a non-zero error code.  On error the request is
+ * freed, GHF_INCOMING_REQUEST is cleared and gh_finished_request is set
+ * to 0.
+ *
+ * End of file while data is still missing is treated as an error
+ * (ECONNRESET): returning GNBD_CONTINUE would make the caller poll a
+ * closed connection forever.
+ */
+static int
+incoming_request(struct gnbd_handle *gh)
+{
+       struct gnbd_request *gr = gh->gh_incoming_request;
+       size_t done = 0;        /* matches read_buf()'s size_t * parameter */
+       int err;
+
+       DPRINTF(("incoming_request: done %d size %d\n", gr->gr_done,
+                   gr->gr_size));
+       err = read_buf(gh->gh_fd, gr->gr_buf + gr->gr_done,
+           gr->gr_size - gr->gr_done, &done);
+       if (err)
+               goto out;
+       if (done == 0) {
+               /* Bytes are still missing here, so 0 means EOF. */
+               err = ECONNRESET;
+               goto out;
+       }
+
+       DPRINTF(("incoming_request: got %d\n", done));
+       gr->gr_done += done;
+       if (gr->gr_done == gr->gr_size) {
+               gh->gh_flags &= ~GHF_INCOMING_REQUEST;
+               gh->gh_finished_request = gr->gr_cookie;
+               free(gr);
+               return GNBD_REQUEST_DONE;
+       }
+
+       return GNBD_CONTINUE;
+
+ out:
+       gh->gh_flags &= ~GHF_INCOMING_REQUEST;
+       gh->gh_finished_request = 0;
+       free(gr);
+       return err;
+}
+
+
+
+/*
+ * Release a handle: free every outstanding request, free the request
+ * whose data was being received (if any), close the socket and free the
+ * handle itself.  gh must not be used afterwards.
+ *
+ * Returns the result of close() on the handle's socket; a failure is also
+ * reported with warnx().
+ */
+int
+gnbd_close(struct gnbd_handle *gh)
+{
+       int err;
+       struct gnbd_request *gr, *next;
+
+       /*
+        * Fetch the successor before freeing each node; advancing through
+        * gr_next of an already freed request is a use-after-free.
+        */
+       for (gr = gh->gh_outstanding_requests; gr != NULL; gr = next) {
+               next = gr->gr_next;
+               free(gr);
+       }
+
+       if (gh->gh_flags & GHF_INCOMING_REQUEST)
+               free(gh->gh_incoming_request);
+
+       err = close(gh->gh_fd);
+       if (err)
+               warnx("close");
+       free(gh);
+
+       return err;
+}
+
+/*
+ * Return the handle's socket descriptor (-1 while disconnected), e.g. for
+ * use with poll().
+ */
+int
+gnbd_fd(struct gnbd_handle *gh)
+{
+       const int fd = gh->gh_fd;
+
+       return fd;
+}
+
+/*
+ * Return the cookie recorded for the most recently completed request
+ * (gh_finished_request).
+ */
+unsigned long
+gnbd_finished_request(struct gnbd_handle *gh)
+{
+       const unsigned long cookie = gh->gh_finished_request;
+
+       return cookie;
+}
+
+/*
+ * Queue a read of count 512-byte sectors starting at sector into buf.
+ *
+ * The request header is written to the server immediately; the data is
+ * delivered later through gnbd_reply().  cookie is an opaque caller tag
+ * made available through gnbd_finished_request() when the request
+ * completes.  buf must stay valid until then.
+ *
+ * Returns 0 on success, ENOMEM if no request record could be allocated,
+ * or the non-zero error from writing the header (the record is then
+ * freed and nothing is queued).
+ */
+int
+gnbd_read(struct gnbd_handle *gh, uint64_t sector, ssize_t count,
+    unsigned char *buf, unsigned long cookie)
+{
+       struct gnbd_server_request gsr;
+       struct gnbd_request *gr;
+       int err;
+
+       gr = malloc(sizeof(struct gnbd_request));
+       if (gr == NULL)
+               return ENOMEM;
+       /* Clear the whole record: sizeof(gr) is only the pointer size. */
+       memset(gr, 0, sizeof(*gr));
+
+       gr->gr_buf = buf;
+       gr->gr_size = count << 9;
+       gr->gr_done = 0;
+       gr->gr_cookie = cookie;
+
+       gsr.magic = htonl(GNBD_REQUEST_MAGIC);
+       gsr.type = htonl(GNBD_CMD_READ);
+       gsr.from = htonll(sector << 9);
+       gsr.len = htonl(gr->gr_size);
+       /* The request's own address is the tag echoed back in the reply. */
+       memset(gsr.handle, 0, sizeof(gsr.handle));
+       memcpy(gsr.handle, &gr, sizeof(gr));
+
+       err = write_buf(gh->gh_fd, &gsr, sizeof(gsr));
+       if (err) {
+               warnx("write_buf");
+               goto out;
+       }
+
+       /* Append to the tail of the outstanding list. */
+       *gh->gh_outstanding_requests_last = gr;
+       gh->gh_outstanding_requests_last = &gr->gr_next;
+
+       return 0;
+
+ out:
+       free(gr);
+       return err;
+}
+
+/*
+ * Write count 512-byte sectors from buf to the device starting at sector.
+ *
+ * The request header and the data are both written to the server before
+ * this function returns; the request is then queued so that gnbd_reply()
+ * can match the server's acknowledgement.  cookie is an opaque caller tag
+ * made available through gnbd_finished_request() on completion.
+ *
+ * Returns 0 on success, ENOMEM if no request record could be allocated,
+ * or the non-zero error from writing (the record is then freed and
+ * nothing is queued).
+ */
+int
+gnbd_write(struct gnbd_handle *gh, uint64_t sector, ssize_t count,
+    unsigned char *buf, unsigned long cookie)
+{
+       struct gnbd_server_request gsr;
+       struct gnbd_request *gr;
+       int err;
+
+       gr = malloc(sizeof(struct gnbd_request));
+       if (gr == NULL)
+               return ENOMEM;
+       /* Clear the whole record: sizeof(gr) is only the pointer size. */
+       memset(gr, 0, sizeof(*gr));
+
+       gr->gr_buf = buf;
+       gr->gr_size = count << 9;
+       gr->gr_done = 0;
+       gr->gr_cookie = cookie;
+
+       gsr.magic = htonl(GNBD_REQUEST_MAGIC);
+       gsr.type = htonl(GNBD_CMD_WRITE);
+       gsr.from = htonll(sector << 9);
+       gsr.len = htonl(gr->gr_size);
+       /* The request's own address is the tag echoed back in the reply. */
+       memset(gsr.handle, 0, sizeof(gsr.handle));
+       memcpy(gsr.handle, &gr, sizeof(gr));
+
+       err = write_buf(gh->gh_fd, &gsr, sizeof(gsr));
+       if (err) {
+               warnx("write_buf");
+               goto out;
+       }
+
+       /* XXX handle non-blocking socket */
+       err = write_buf(gh->gh_fd, buf, gr->gr_size);
+       if (err) {
+               warnx("write_buf");
+               goto out;
+       }
+       /* All data is out, so the reply completes the request directly. */
+       gr->gr_done += gr->gr_size;
+
+       *gh->gh_outstanding_requests_last = gr;
+       gh->gh_outstanding_requests_last = &gr->gr_next;
+
+       DPRINTF(("write done\n"));
+
+       return 0;
+
+ out:
+       free(gr);
+       return err;
+}
+
+/*
+ * Process whatever the server sent next.  Call this when the handle's
+ * socket is readable.
+ *
+ * Depending on the handle's state this consumes the kill-gserv reply, the
+ * login reply, the next chunk of data for a read in progress, or a new
+ * reply header.
+ *
+ * Returns:
+ *   GNBD_LOGIN_DONE   - login finished; requests may now be issued
+ *   GNBD_REQUEST_DONE - a request completed; see gnbd_finished_request()
+ *   GNBD_CONTINUE     - nothing to report yet (keep-alive, unknown magic,
+ *                       or more data expected)
+ *   0                 - the kill-gserv step completed and the login
+ *                       request was sent
+ *   anything else     - an error code
+ */
+int
+gnbd_reply(struct gnbd_handle *gh)
+{
+       struct gnbd_server_reply gsr;
+       struct gnbd_request *gr;
+       uint32_t srv_err;
+       int err;
+
+       DPRINTF(("gnbd_reply flags %x\n", gh->gh_flags));
+       if ((gh->gh_flags & GHF_EXPECT_KILL_GSERV_REPLY))
+               return kill_gserv_reply(gh);
+       if ((gh->gh_flags & GHF_EXPECT_LOGIN_REPLY))
+               return login_reply(gh);
+       if ((gh->gh_flags & GHF_INCOMING_REQUEST))
+               return incoming_request(gh);
+
+       DPRINTF(("read response\n"));
+       err = read_buf(gh->gh_fd, &gsr, sizeof(gsr), NULL);
+       if (err) {
+               warnx("read gnbd_reply failed");
+               return err;
+       }
+
+       /*
+        * The error field arrives in network byte order: convert it once
+        * and use the converted value for the message and the return code
+        * as well, not just for the test.
+        */
+       srv_err = ntohl(gsr.error);
+       if (srv_err) {
+               warnx("gnbd server reply error: %s", strerror(srv_err));
+               return srv_err;
+       }
+
+       switch (ntohl(gsr.magic)) {
+       case GNBD_KEEP_ALIVE_MAGIC:
+               DPRINTF(("read keep alive magic\n"));
+               return GNBD_CONTINUE;
+       case GNBD_REPLY_MAGIC:
+               DPRINTF(("read reply magic\n"));
+               /* handle carries the request pointer we sent out;
+                * find_request() validates it before it is dereferenced. */
+               memcpy(&gr, gsr.handle, sizeof(gr));
+               err = find_request(gh, gr);
+               if (err) {
+                       warnx("unknown request");
+                       return err;
+               }
+               if (gr->gr_done != gr->gr_size) {
+                       /* Data follows: collect it on later calls. */
+                       gh->gh_incoming_request = gr;
+                       gh->gh_flags |= GHF_INCOMING_REQUEST;
+                       return GNBD_CONTINUE;
+               } else {
+                       gh->gh_finished_request = gr->gr_cookie;
+                       free(gr);
+                       return GNBD_REQUEST_DONE;
+               }
+       default:
+               break;
+       }
+
+       return GNBD_CONTINUE;
+}
+
+/*
+ * Return the device size in sectors as recorded from the login reply
+ * (gh_sectors).
+ */
+uint64_t
+gnbd_sectors(struct gnbd_handle *gh)
+{
+       const uint64_t nsectors = gh->gh_sectors;
+
+       return nsectors;
+}
+
+/*
+ * Create a handle for device devname on server:port, identifying this
+ * client as nodename, and start the connection sequence by sending the
+ * kill-gserv request.  The caller then drives the rest (kill reply,
+ * login, login reply) by calling gnbd_reply() whenever gnbd_fd() becomes
+ * readable.
+ *
+ * server is resolved with getaddrinfo(); the first IPv4 stream address is
+ * used.  Returns the new handle, or NULL on allocation failure, name
+ * resolution failure, no usable address, or failure to send the first
+ * request.
+ */
+struct gnbd_handle *
+gnbd_setup(char *server, unsigned int port, char *devname, char *nodename)
+{
+       struct gnbd_handle *gh;
+       struct addrinfo *res = NULL, *ai;
+       int err;
+
+       gh = malloc(sizeof(struct gnbd_handle));
+       if (gh == NULL)
+               return NULL;
+       /*
+        * Clear the whole structure: sizeof(gh) is only the pointer size
+        * and would leave gh_flags, the request list head and gh_sin
+        * holding garbage.
+        */
+       memset(gh, 0, sizeof(*gh));
+       gh->gh_fd = -1;
+       gh->gh_outstanding_requests_last = &gh->gh_outstanding_requests;
+
+       strncpy(gh->gh_devname, devname, sizeof(gh->gh_devname));
+       strncpy(gh->gh_nodename, nodename, sizeof(gh->gh_nodename));
+
+       err = getaddrinfo(server, NULL, NULL, &res);
+       if (err) {
+               if (err != EAI_SYSTEM)
+                       warnx("getaddrinfo: %s", gai_strerror(err));
+               else
+                       warn("getaddrinfo: %s", gai_strerror(err));
+               /* Nothing was returned, so nothing must be freed. */
+               res = NULL;
+               goto out;
+       }
+
+       for (ai = res; ai; ai = ai->ai_next) {
+               if (ai->ai_socktype != SOCK_STREAM)
+                       continue;
+               if (ai->ai_family == AF_INET)
+                       break;
+       }
+
+       if (ai == NULL)
+               goto out;
+
+       gh->gh_sin.sin_family = ai->ai_family;
+       gh->gh_sin.sin_port = htons(port);
+       memcpy(&gh->gh_sin.sin_addr,
+           &((struct sockaddr_in *)ai->ai_addr)->sin_addr,
+           sizeof(gh->gh_sin.sin_addr));
+
+       err = kill_gserv(gh);
+       if (err) {
+               warnx("gnbd_kill_gserv");
+               goto out;
+       }
+
+       freeaddrinfo(res);
+       return gh;
+ out:
+       free(gh);
+       if (res != NULL)
+               freeaddrinfo(res);
+       return NULL;
+}
diff --git a/tools/blktap/libgnbd/libgnbd.h b/tools/blktap/libgnbd/libgnbd.h
new file mode 100644 (file)
index 0000000..9fb3dbb
--- /dev/null
@@ -0,0 +1,25 @@
+/* libgnbd.h
+ *
+ * gnbd client library
+ *
+ * Copyright (c) 2005, Christian Limpach
+ */
+     
+#ifndef __LIBGNBD_H__
+#define __LIBGNBD_H__
+
+/* Make the header self-contained: it uses these types in its prototypes. */
+#include <stdint.h>            /* uint64_t */
+#include <sys/types.h>         /* ssize_t */
+
+/*
+ * Status codes returned by gnbd_reply() in addition to 0 and plain error
+ * codes:
+ *   GNBD_LOGIN_DONE     - login completed; the device size is available
+ *   GNBD_REQUEST_DONE   - a request completed; see gnbd_finished_request()
+ *   GNBD_CONTINUE       - nothing to report yet, keep polling
+ *   GNBD_CONTINUE_WRITE - not returned by the current libgnbd.c
+ */
+#define GNBD_LOGIN_DONE                0x10001
+#define GNBD_REQUEST_DONE      0x10002
+#define GNBD_CONTINUE          0x10003
+#define GNBD_CONTINUE_WRITE    0x10004
+
+/* Opaque connection handle, created by gnbd_setup(). */
+struct gnbd_handle;
+
+/* Free all resources of a handle and close its socket. */
+int gnbd_close(struct gnbd_handle *);
+/* Socket descriptor to poll for readability. */
+int gnbd_fd(struct gnbd_handle *);
+/* Cookie of the most recently completed request. */
+unsigned long gnbd_finished_request(struct gnbd_handle *);
+/*
+ * NOTE(review): libgnbd.c defines no functions with the next two names
+ * (its kill_gserv() and login() are static), so calling them fails at
+ * link time.  The declarations are kept for source compatibility.
+ */
+int gnbd_kill_gserv(struct gnbd_handle *);
+int gnbd_login(struct gnbd_handle *);
+/* Queue a read: (handle, first sector, sector count, buffer, cookie). */
+int gnbd_read(struct gnbd_handle *, uint64_t, ssize_t, unsigned char *,
+    unsigned long);
+/* Issue a write: (handle, first sector, sector count, buffer, cookie). */
+int gnbd_write(struct gnbd_handle *, uint64_t, ssize_t, unsigned char *,
+    unsigned long);
+/* Process the next message from the server; returns a status code. */
+int gnbd_reply(struct gnbd_handle *);
+/* Device size in sectors, valid after GNBD_LOGIN_DONE. */
+uint64_t gnbd_sectors(struct gnbd_handle *);
+/* Create a handle: (server, port, device name, node name). */
+struct gnbd_handle *gnbd_setup(char *, unsigned int, char *, char *);
+
+#endif /* __LIBGNBD_H__ */
index 8a1de09bfc0a8743064a94738ada6e2a2d2dd2e4..f40f7d5b6be4bfffbbca99fc8f42646ba79e5cba 100755 (executable)
@@ -363,6 +363,20 @@ class BlkDev(controller.SplitDev):
         Blkctl.block('unbind', self.type, self.node)
 
     def setNode(self, node):
+    
+        # NOTE: 
+        # This clause is testing code for storage system experiments.
+        # Add a new disk type that will just pass an opaque id in the
+        # start_sector and use an experimental device type.
+        # Please contact andrew.warfield@cl.cam.ac.uk with any concerns.
+        if self.type == 'amorfs':
+            self.node   = node
+            self.device =  61440 # (240,0)
+            self.start_sector = long(self.params)
+            self.nr_sectors = long(0)
+            return
+        # done.
+            
         mounted_mode = check_mounted(self, node)
         if not '!' in self.mode and mounted_mode:
             if mounted_mode is "w":
index 4108f4e545776607149fbb02036d00fd6320843f..35b1b78f84bd984692e5ecce11e75ab51ca12921 100644 (file)
@@ -9,7 +9,7 @@
 #ifndef __XEN_PUBLIC_IO_BLKIF_H__
 #define __XEN_PUBLIC_IO_BLKIF_H__
 
-#include <asm-xen/xen-public/io/ring.h>
+#include "ring.h"
 
 #define blkif_vdev_t   u16
 #define blkif_sector_t u64